llama-3-scores / app.py
shreeyad's picture
fix hindi lora path
fa3f4d0
raw
history blame
7.61 kB
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
class LLaMAScoreAnalyzer:
def __init__(self):
self.languages = ["Nepali", "Hindi"]
self.models = ["Baseline", "LoRA"]
self.scores_gpt = ["relevance_score", "cc_score", "syntax_score", "complete_score"]
self.rouge_bleu = ["rougeL", "bleu"]
self.categories = ["hallucination_type"]
self.DATA_PATH = {
"Nepali": {"Baseline": "./data/nepali_baseline_all_scores.csv", "LoRA": "./data/nepali_lora_all_scores.csv"},
"Hindi": {"Baseline": "./data/hindi_baseline_all_scores.csv", "LoRA": "./data/hindi_lora_all_scores.csv"}
}
def load_samples(self, lang):
cols_to_show = ["instruction", "input", "output"]
for model in self.DATA_PATH[lang]:
df = pd.read_csv(self.DATA_PATH[lang][model])
df.rename({"output": "expected_output"})
df[model+"_Response"] = df["cleaned_response"]
cols_to_show.append(model+"_Response")
cols_to_show = cols_to_show + ["relevance_score", "cc_score", "syntax_score", "complete_score", "rougeL", "blue", "is_repeat", "hallucination_type"]
df = df[[col for col in cols_to_show if col in df.columns]]
st.write(df.sample(5))
def load_data(self, lang, model):
df = pd.read_csv(self.DATA_PATH[lang][model])
df['Language'] = lang
df['Model'] = model
return df
def draw_specific_plots(self, data, categories, x_variable, title):
fig, ax = plt.subplots(figsize=(12, 6))
palette = sns.color_palette("pastel", len(categories) * len(data[x_variable].unique()))
for i, category in enumerate(categories):
for j, unique_value in enumerate(data[x_variable].unique()):
subset = data[data[x_variable] == unique_value]
sns.kdeplot(data=subset, x=category, fill=True, common_norm=False, alpha=0.5,
ax=ax, color=palette[i * len(data[x_variable].unique()) + j],
label=f"{category} ({unique_value})")
ax.set_title(title, fontsize=16)
ax.set_xlabel("Score", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.legend(title="Category (Language/Model)")
return fig
def draw_combined_density_plot(self, data, title):
fig, ax = plt.subplots(figsize=(12, 8))
palette = sns.color_palette("pastel", len(self.scores_gpt))
for i, category in enumerate(self.scores_gpt):
sns.kdeplot(data=data, x=category, fill=True, common_norm=False, alpha=0.5, ax=ax, label=category, color=palette[i])
ax.set_title(title, fontsize=16)
ax.set_xlabel("Score", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.legend(title="Score Categories")
return fig
def draw_bar_plot(self, data, categories, x_variable, title):
fig, axs = plt.subplots(len(categories), 1, figsize=(10, 6 * len(categories)))
palette = sns.color_palette("pastel", len(data))
if len(categories) == 1:
axs = [axs] # Ensure axs is iterable even for a single plot
for i, category in enumerate(categories):
sns.countplot(data=data, x=category, hue=x_variable, palette=palette, ax=axs[i])
axs[i].set_title(f"Distribution of {category} for {title}", fontsize=16)
axs[i].set_xlabel(category, fontsize=12)
axs[i].set_ylabel("Count", fontsize=12)
axs[i].legend(title=x_variable)
plt.tight_layout()
return fig
def score_analyzer(self):
st.sidebar.markdown("""
This App was created as a part of the project: "Fine-tuning LLaMA 3 with Low-Rank Adaptation for Nepali and Hindi"
""")
st.title("Findings from Fine-tuning LLaMA 3 with Low-Rank Adaptation for Nepali and Hind! ")
st.markdown("""
Full post here:
""")
show_samples = st.sidebar.checkbox("Show Sample Data", value=False)
detailed_view = st.sidebar.checkbox("Enable Detailed Charts View", value=False)
selected_languages = st.sidebar.multiselect("Select Languages", self.languages, default="Nepali")
selected_gpt_scoring = st.sidebar.multiselect("Select Score Category", self.scores_gpt, default="relevance_score")
selected_models = st.sidebar.multiselect("Select Models", self.models, default="Baseline")
dfs = []
for lang in selected_languages:
for model in selected_models:
df = self.load_data(lang, model)
dfs.append(df)
if show_samples:
for lang in selected_languages:
st.write(f"Sample data for {lang}")
self.load_samples(lang)
combined_data = pd.concat(dfs, ignore_index=True)
if detailed_view:
for language in selected_languages:
language_data = combined_data[combined_data['Language'] == language]
title = f"Distribution of Scores for {language}"
fig = self.draw_specific_plots(language_data, selected_gpt_scoring, 'Model', title)
st.pyplot(fig)
if len(selected_languages) > 1:
for model in selected_models:
model_data = combined_data[combined_data['Model'] == model]
title = f"Distribution of Scores for {model}"
fig = self.draw_specific_plots(model_data, selected_gpt_scoring, 'Language', title)
st.pyplot(fig)
st.sidebar.markdown("""
Show additional evaluation scores and categories below:
""")
additional_score_categories = st.sidebar.checkbox("Hallucination Statistics", value=False)
if additional_score_categories:
additional_categories = st.sidebar.multiselect("Select Category", self.categories, default="hallucination_type")
for language in selected_languages:
language_data = combined_data[combined_data['Language'] == language]
title = f"{language}"
fig = self.draw_bar_plot(language_data, additional_categories, 'Model', title)
st.pyplot(fig)
if len(selected_languages) > 1:
for model in selected_models:
model_data = combined_data[combined_data['Model'] == model]
title = f"{model}"
fig = self.draw_bar_plot(model_data, additional_categories, 'Language', title)
st.pyplot(fig)
else:
for language in selected_languages:
for model in selected_models:
title = f"Distribution of Scores for Different Evaluation Criterias for {language} [{model} Model]"
fig = self.draw_combined_density_plot(combined_data[(combined_data['Language'] == language) &
(combined_data['Model'] == model)], title)
st.pyplot(fig)
def main():
st.sidebar.header("Findings from Fine-tuning LLaMA 3 with Low-Rank Adaptation for Nepali and Hindi!")
analyzer = LLaMAScoreAnalyzer()
analyzer.score_analyzer()
if __name__ == "__main__":
main()