|
from transformers import AutoTokenizer |
|
from tqdm import tqdm |
|
import gradio as gr |
|
import pandas as pd |
|
from datasets import load_dataset |
|
import random |
|
from pathlib import Path |
|
|
|
# Hugging Face repo ids of the tokenizers that are benchmarked at start-up.
# The order only determines the benchmarking order; the leaderboard itself is
# sorted by token count afterwards.
initial_list_of_models = [
    # Aranizer family (riotu-lab)
    "riotu-lab/Aranizer-PBE-86k",
    "riotu-lab/Aranizer-PBE-64k",
    "riotu-lab/Aranizer-PBE-32k",
    "riotu-lab/Aranizer-SP-86k",
    "riotu-lab/Aranizer-SP-64k",
    "riotu-lab/Aranizer-SP-32k",
    # Other tokenizers
    "asafaya/bert-base-arabic",
    "inceptionai/jais-family-30b-16k",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen2.5-72B-Instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/Llama-3.3-70B-Instruct",
    # Cohere
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "CohereForAI/aya-101",
]
|
|
|
# The leaderboard is persisted as JSON Lines next to this file. Reuse it when
# it is already there, otherwise start from an empty table with the expected
# columns.
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
df = (
    pd.read_json(dataframe_path, lines=True)
    if dataframe_path.exists()
    else pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
)
|
|
|
|
|
# Evaluation corpus: the text column of three Arabic datasets, merged into
# one list of samples.
arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]

# Unpack instead of using `+`: this only requires the columns to be iterable,
# so it keeps working when column access does not return a plain list.
all_data = [*arabic_dataset1, *arabic_dataset2, *arabic_dataset3]
print(f"Total number of samples: {len(all_data)}")

# One big string for tokenization and its whitespace-separated words, which
# are the denominator of the fertility score.
all_text = " ".join(all_data)
all_words = all_text.split()
|
|
|
def benchmark_tokenizer(model_name) -> dict:
    """Benchmark one tokenizer on the evaluation corpus.

    Args:
        model_name: Hugging Face repo id of the tokenizer to load.

    Returns:
        A dict holding one leaderboard row: the tashkeel indicator, the model
        name, the fertility score (tokens per whitespace-separated word,
        rounded to 3 decimals), the vocabulary size, the total number of
        tokens produced for the corpus and the tokenizer class name.
    """
    # NOTE(security): trust_remote_code=True lets the repository run its own
    # Python code while loading, and `model_name` can come straight from the
    # submission textbox. Kept as is because some tokenizers need it; review
    # before exposing this app to untrusted users.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))

    # Tashkeel check: a fully diacritized sentence must survive an
    # encode -> decode round trip unchanged.
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"

    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }
|
|
|
|
|
# Benchmark every default model that is not in the leaderboard yet. Rows are
# collected first and added with a single public `pd.concat` call, rather than
# growing the frame row by row through the private `DataFrame._append`.
known_models = set(df["📛 Models"])
new_rows = []
for model_name in tqdm(initial_list_of_models):
    if model_name in known_models:
        continue
    new_rows.append(benchmark_tokenizer(model_name))
    known_models.add(model_name)

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Fewest tokens first.
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

# Persist the leaderboard so later runs and `refresh` can reuse it.
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
|
|
|
|
|
def submit(model_name):
    """Add a user-submitted tokenizer to the leaderboard.

    A model that is already listed is not benchmarked again. A new model is
    benchmarked, appended to the module-level ``df``, and the re-sorted table
    is written back to ``dataframe_path``.

    Args:
        model_name: Hugging Face repo id typed by the user.

    Returns:
        A ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` tuple built from the
        current leaderboard.

    Raises:
        gr.Error: If the submitted name is empty or only whitespace.
    """
    global df

    # Surrounding whitespace is a common copy/paste artefact and would
    # otherwise be treated as part of the repo id.
    model_name = (model_name or "").strip()
    if not model_name:
        raise gr.Error("Please enter a model name.")

    if model_name not in df["📛 Models"].values:
        benchmark_data = benchmark_tokenizer(model_name)
        # Public replacement for the private `DataFrame._append`.
        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
        df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)

    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
|
|
|
|
|
def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format.

    Colors are drawn without replacement from an evenly spaced grid of the
    RGB cube, so the result never contains duplicates and the function always
    terminates.

    Args:
        n: Number of colors wanted.

    Returns:
        A list of ``n`` distinct ``"#RRGGBB"`` strings (uppercase hex) in
        random order; an empty list when ``n`` is zero or negative.

    Raises:
        ValueError: If more colors are requested than 24-bit RGB can hold.
    """
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    if n <= 0:
        return []

    # Smallest number of levels per channel whose cube holds n colors. At
    # least two levels so that a single color is still picked at random.
    levels = 2
    while levels**3 < n:
        levels += 1

    result = []
    # Each sampled integer encodes one (r, g, b) grid cell in base `levels`.
    for cell in random.sample(range(levels**3), n):
        cell, b = divmod(cell, levels)
        r, g = divmod(cell, levels)
        # Spread the grid over 0..255. Integer division keeps the levels
        # distinct because adjacent numerators differ by 255 >= levels - 1.
        r, g, b = ((level * 255) // (levels - 1) for level in (r, g, b))
        result.append(f"#{r:02X}{g:02X}{b:02X}")

    return result
|
|
|
|
|
def decode_bpe_tokens(tokens):
    """Replace the leading ``Ġ`` marker of each token with a space.

    Only a marker in first position is rewritten; every other character is
    passed through unchanged. The former ``encode("utf-8").decode("utf-8")``
    round trip returned its input unchanged and its ``UnicodeDecodeError``
    handlers could never run, so both were removed.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with one string per input token.
    """
    return [
        " " + token[1:] if token.startswith("Ġ") else token
        for token in tokens
    ]
|
|
|
|
|
def tokenize_text(text, chosen_model, better_tokenization=False):
    """Tokenize ``text`` with ``chosen_model`` and color every token.

    Args:
        text: Text to tokenize.
        chosen_model: Hugging Face repo id of the tokenizer to load.
        better_tokenization: When true, show the slice of the original text
            that produces each token instead of the raw token string.

    Returns:
        A ``gr.HighlightedText`` whose value is a list of ``(token, label)``
        pairs and whose color map assigns one color to each label.
    """
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    # Skip the color generator when there is nothing to color.
    random_colors = (
        generate_distinct_colors(len(tokenized_text)) if tokenized_text else []
    )

    if better_tokenization:
        final_tokenized_text = []
        remaining = text
        for token in tokenized_text:
            # Grow a prefix of the remaining text one character at a time
            # until its first token matches, then consume that prefix.
            for end in range(1, len(remaining) + 1):
                prefix = remaining[:end]
                current_token = decode_bpe_tokens(tokenizer.tokenize(prefix))
                # A prefix may tokenize to nothing (e.g. whitespace only);
                # guard before indexing.
                if current_token and current_token[0] == token:
                    final_tokenized_text.append(prefix)
                    remaining = remaining[end:]
                    break
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        # The label attached to the token and the color-map key must be the
        # same string, otherwise the token gets no color.
        label = str(idx)
        output.append((token, label))
        color_map[label] = random_colors[idx % len(random_colors)]

    # Passed by keyword so the call does not depend on the parameter position.
    return gr.HighlightedText(output, color_map=color_map)
|
|
|
|
|
def refresh():
    """Reload the leaderboard from disk and rebuild the dependent widgets.

    Replaces the module-level ``df`` with the content of ``dataframe_path``.

    Returns:
        A ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` tuple built from the
        freshly loaded table.
    """
    global df
    df = pd.read_json(dataframe_path, lines=True)

    table_update = gr.Dataframe(df)
    plot_update = gr.BarPlot(df)
    dropdown_update = gr.Dropdown(choices=df["📛 Models"].tolist())
    return table_update, plot_update, dropdown_update
|
|
|
# Markdown text passed to `gr.Markdown` at the top of the page.
# NOTE(review): the first paragraph links to
# `MohamedRashad/rasaif-translations`, while the corpus is loaded from
# `ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts` — confirm which
# link should be shown to users.
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).

**A tokenizer that scores high in this leaderboard should be efficient in parsing Arabic in its different dialects and forms.**

## Updates/Notes:
1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (Lower is better).
1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`)
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (Lower is better).

**Note**: Press `Refresh` to get the latest data available in the leaderboard (The initial state may be deceiving).
"""
|
|
|
# Gradio interface: a leaderboard tab (table, bar plot, model submission) and
# a playground tab that visualizes how a chosen tokenizer splits a text.
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)

    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)")
        with gr.Row():
            submit_new_model_btn = gr.Button(value="Submit New Model", variant="primary", scale=3)
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)

    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        # The first model of the (sorted) leaderboard is preselected.
        available_models = df["📛 Models"].tolist()
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=available_models,
            value=available_models[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    # Event wiring: both leaderboard buttons update the table, the plot and
    # the playground's model dropdown.
    leaderboard_outputs = [dataframe, barplot, dropdown]
    submit_new_model_btn.click(fn=submit, inputs=model_name, outputs=leaderboard_outputs)
    refresh_btn.click(fn=refresh, outputs=leaderboard_outputs)
    submit_text_btn.click(
        fn=tokenize_text,
        inputs=[text, dropdown, checkbox],
        outputs=[tokenized_textbox],
    )


demo.launch()
|
|