Installing the required libraries


In [None]:
!pip install datasets==2.14.5
!pip install transformers==4.28.0
!pip install protobuf==3.20.*

Importing the dataset from Hugging Face and splitting it into training and validation sets

In [None]:
from datasets import load_dataset

# Fetch the transliteration dataset from the Hugging Face Hub.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Carve 20% of the "train" split off for validation; the fixed seed keeps the
# partition identical from run to run.
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.2)
train_dataset, val_dataset = (split_dataset[part] for part in ("train", "test"))

print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")


Tokenizing the data and training the model

In [3]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Trainer, TrainingArguments
import torch

# Checkpoint that both the model weights and the tokenizer are loaded from.
model_name = "facebook/mbart-large-50"

model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Language codes the tokenizer uses: the romanized input is tagged as English
# ("en_XX") and the Bengali-script output as "bn_IN".
tokenizer.tgt_lang = "bn_IN"
tokenizer.src_lang = "en_XX"


def preprocess(batch):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        batch: a batch from ``Dataset.map(..., batched=True)`` holding the
            romanized text under ``"rm"`` and the Bengali text under ``"bn"``.

    Returns:
        The tokenizer output for the ``"rm"`` texts (truncated/padded to 128
        tokens) with an added ``"labels"`` entry containing the token ids of
        the ``"bn"`` texts, also truncated/padded to 128 tokens.
    """
    model_inputs = tokenizer(batch["rm"], max_length=128, truncation=True, padding="max_length")
    # text_target= switches the tokenizer into target mode, so the labels carry
    # the tgt_lang code instead of the source-language code that a plain
    # tokenizer(...) call would prepend.
    targets = tokenizer(text_target=batch["bn"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs


def _to_model_tensors(split):
    """Tokenize a split with ``preprocess`` and expose the model columns as torch tensors."""
    encoded = split.map(preprocess, batched=True)
    encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded


# Both splits go through exactly the same tokenization and formatting.
train_dataset = _to_model_tensors(train_dataset)
val_dataset = _to_model_tensors(val_dataset)


# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./mbart_results",
    logging_dir="./mbart_logs",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Half precision is requested only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Evaluation and logging cadence.
    evaluation_strategy="epoch",
    logging_steps=10,
    # Checkpointing cadence and retention limit.
    save_steps=500,
    save_total_limit=2,
)

def collate_with_masked_label_padding(features):
    """Stack tokenized examples into a batch and hide label padding from the loss.

    Every label sequence is padded to a fixed length with the tokenizer's pad
    token. Left as-is, those pad positions would be scored by the loss just
    like real tokens, so they are replaced with -100 (the index the
    cross-entropy loss ignores) when the batch is built. The stored dataset is
    not modified, so its labels can still be decoded directly.

    Args:
        features: list of examples, each providing ``input_ids``,
            ``attention_mask`` and ``labels`` of equal length.

    Returns:
        Dict of stacked tensors under the same three keys.
    """
    batch = {
        key: torch.stack([torch.as_tensor(example[key]) for example in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
    batch["labels"] = batch["labels"].masked_fill(
        batch["labels"] == tokenizer.pad_token_id, -100
    )
    return batch


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=collate_with_masked_label_padding,
)

# Launch fine-tuning; evaluation and checkpointing follow training_args.
trainer.train()


KeyboardInterrupt: 

Evaluating the model and generating predictions

In [None]:
import torch

# Spot-check the model on the first ten validation examples.
sample = val_dataset.select(range(10))

# Send the tensors to whichever device the model is actually on, instead of
# assuming that "CUDA is available" means "the model is on CUDA".
inputs = sample["input_ids"].to(model.device)
attention_mask = sample["attention_mask"].to(model.device)

# The model may still be in training mode here (for example if training was
# interrupted), which would leave dropout active during generation.
model.eval()
with torch.no_grad():
    # Pass the attention mask explicitly so padded positions are ignored.
    preds = model.generate(input_ids=inputs, attention_mask=attention_mask)

decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
decoded_labels = tokenizer.batch_decode(sample["labels"], skip_special_tokens=True, clean_up_tokenization_spaces=True)

for i, (pred, label) in enumerate(zip(decoded_preds, decoded_labels)):
    print(f"Sample {i + 1}")
    print(f"Prediction: {pred}")
    print(f"Label: {label}\n")


Saving the fine-tuned model

In [None]:
# Persist the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./banglish-to-bangla")

Taking custom input from the user to try out the model

In [None]:
import torch

def translate_banglish_to_bangla(model, tokenizer, banglish_input):
    """Generate Bengali-script text for a piece of Banglish (romanized) input.

    Args:
        model: the seq2seq model to generate with; it is moved to CUDA when a
            CUDA device is available and switched to eval mode.
        tokenizer: the tokenizer paired with ``model``; must provide
            ``lang_code_to_id`` with a ``"bn_IN"`` entry.
        banglish_input: the text to convert (truncated to 128 tokens).

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    if torch.cuda.is_available():
        model = model.cuda()
    # Disable dropout so repeated calls on the same input agree.
    model.eval()

    inputs = tokenizer(banglish_input, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the input tensors on the same device as the model.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    with torch.no_grad():
        # mBART-50 selects the output language by forcing the language code as
        # the first generated token; it must not replace the decoder start token.
        translated_tokens = model.generate(
            **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["bn_IN"]
        )

    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

# Simple read-translate-print loop; typing "exit" (in any letter case) ends it.
print("Enter your Banglish text (type 'exit' to quit):")
banglish_text = input("Banglish: ")
while banglish_text.lower() != "exit":
    translated_text = translate_banglish_to_bangla(model, tokenizer, banglish_text)
    print(f"Translated Bangla: {translated_text}\n")
    banglish_text = input("Banglish: ")


Exporting the saved model as a .zip archive

In [None]:
from google.colab import files
import zipfile

def zipdir(path, ziph):
    """Write every file found beneath ``path`` into an open zip archive.

    ``ziph`` is a writable zipfile handle. The directory tree is walked
    recursively and each file is added under the path it was found at
    (``path`` joined with its location inside the tree); no explicit archive
    name is supplied.
    """
    file_paths = (
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(path)
        for filename in filenames
    )
    for file_path in file_paths:
        ziph.write(file_path)

import os
# Bundle the saved model directory into a zip archive and hand it to the
# browser for download; do nothing but report if the directory is missing.
if not os.path.exists("./banglish-to-bangla"):
    print("Directory ./banglish-to-bangla not found. Please run the training code first.")
else:
    # The with-block closes (and thereby finalizes) the archive even if adding
    # a file fails, so no handle is left open.
    with zipfile.ZipFile('banglish-to-bangla.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir('./banglish-to-bangla', zipf)
    files.download('banglish-to-bangla.zip')