AigizK/bashkir-russian-parallel-corpora
Viewer • Updated • 1.2M • 64 • 15
How to use zhursvlevy/t5-small-bashkir-russian with Transformers:
# Use a pipeline as a high-level helper
# Warning: Pipeline type "translation" is no longer supported in transformers v5.
# You must load the model directly (see below) or downgrade to v4.x with:
#   pip install "transformers<5.0.0"
from transformers import pipeline
pipe = pipeline("translation", model="zhursvlevy/t5-small-bashkir-russian")

# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("zhursvlevy/t5-small-bashkir-russian")
model = AutoModelForSeq2SeqLM.from_pretrained("zhursvlevy/t5-small-bashkir-russian")

t5-small from the Google T5 repo, fine-tuned on Russian-Bashkir corpora.
BLEU: 0.3018
chrF: 0.5478
Use the example below*:
from typing import List, Union
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
def _normalize_sentence(text: str) -> str:
    """Capitalize the first character of *text* and make it end with a period."""
    if not text:
        # Nothing to capitalize; indexing text[0] would raise IndexError.
        return text
    sentence = text[0].upper() + text[1:]
    return sentence if sentence.endswith(".") else sentence + "."


# The transformers types are quoted so the annotations are never evaluated when
# the function is defined: `T5TokenizerFast` is not imported by this example,
# and an unquoted reference would raise NameError at definition time.
@torch.inference_mode()
def infer(
    model: "T5ForConditionalGeneration",
    tokenizer: Union["T5TokenizerFast", "T5Tokenizer"],
    device: str,
    texts: List[str],
    target_language: str,
    max_length: int = 256,
) -> List[str]:
    """Translate a batch of sentences between Bashkir and Russian.

    Each text is capitalized, terminated with a period if it lacks one, and
    prefixed with the translation direction before being tokenized and passed
    to ``model.generate``.

    Args:
        model: Seq2seq model used for generation. It is switched to eval mode
            but is NOT moved to ``device`` here; presumably the caller has
            already placed it there.
        tokenizer: Tokenizer matching ``model``.
        device: Device the tokenized inputs are moved to (e.g. ``"cpu"``).
        texts: Source sentences to translate.
        target_language: ``"русский"`` to translate Bashkir -> Russian, or
            ``"башкирский"`` to translate Russian -> Bashkir.
        max_length: Length (in tokens) every input is padded/truncated to.

    Returns:
        One decoded translation per input text, in input order.

    Raises:
        AssertionError: If ``target_language`` is not one of the two
            supported values.
    """
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type is the same as before.
    if target_language not in ("русский", "башкирский"):
        raise AssertionError("target language must be in (русский, башкирский)")

    if not texts:
        # Nothing to translate; avoid handing an empty batch to the tokenizer.
        return []

    prefix = (
        "башкирский-русский: "
        if target_language == "русский"
        else "русский-башкирский: "
    )
    text_with_prefix = [prefix + _normalize_sentence(text) for text in texts]

    inputs = tokenizer(
        text_with_prefix,
        padding="max_length",
        max_length=max_length,  # honor the caller's limit instead of a fixed 256
        truncation=True,
        return_tensors="pt",
    )

    model.eval()
    outputs = model.generate(
        inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
if __name__ == "__main__":
    # Load the fine-tuned checkpoint and its tokenizer by model id.
    tokenizer = T5Tokenizer.from_pretrained("zhursvlevy/t5-small-bashkir-russian")
    model = T5ForConditionalGeneration.from_pretrained("zhursvlevy/t5-small-bashkir-russian")

    # Bashkir source sentence and its reference Russian translation.
    input_text = "Тормоштоң, Ғаләмдең һәм бөтә нәмәнең төп һорауына яуап"
    output_text = "Ответ на главный вопрос жизни, Вселенной и всего такого"

    # Show the model output next to the reference; previously the result of
    # infer() was discarded, so running the example printed nothing.
    translations = infer(model, tokenizer, "cpu", [input_text], "русский")
    print("translation:", translations[0])
    print("reference:  ", output_text)
*The widget may not work correctly due to the default pipeline.