Merge pull request #51 from project-kxkg/evaluation
Evaluation
Former-commit-id: ef5be7bdfbbb13908d071d9a8785d3b06be1143f
- README.md +2 -2
- evaluation/alignment.py +139 -0
- evaluation/evaluation.py +58 -0
- evaluation/readme.md +24 -0
- evaluation/scores/LLM_eval.py +121 -0
- evaluation/scores/__init__.py +0 -0
- evaluation/scores/multi_scores.py +63 -0
- evaluation/scores/score.py +15 -0
- requirement.txt +2 -0
- src/srt_util/srt.py +13 -4
README.md
CHANGED
@@ -1,4 +1,4 @@
-#
+# Pigeon AI: Automatic Video Translation Toolkit
 
 ## Installation
 
@@ -51,4 +51,4 @@ options:
 
 ## Notice
 if you cannot download youtube video, please follow the link below.
-https://github.com/pytube/pytube/issues/1498
+https://github.com/pytube/pytube/issues/1498
evaluation/alignment.py
ADDED
@@ -0,0 +1,139 @@
+import sys
+import numpy as np
+sys.path.append('../src')
+from srt_util.srt import SrtScript
+from srt_util.srt import SrtSegment
+
+
+# Helper method
+# Align sub/anchor segment pairs via a greedy approach
+# Input: anchor segment, SRT segments, output array of sub, index of current sub
+# Output: updated index of sub
+def procedure(anchor, subsec, S_arr, subidx):
+    cache_idx = 0
+    while subidx != cache_idx:  # Terminate when alignment stabilizes
+        cache_idx = subidx
+        # If the sub segments run out during the loop, terminate
+        if subidx >= len(subsec):
+            break
+        sub = subsec[subidx]
+        if anchor.end < sub.start:
+            continue
+        # If the next sub overlaps the anchor more heavily, add it to the current alignment
+        if (anchor.start <= sub.start) and (sub.end <= anchor.end) or anchor.end - sub.start > sub.end - anchor.start:
+            S_arr[-1] += sub  # .source_text
+            subidx += 1
+
+    return subidx - 1  # Reset last invalid update from loop
+
+
+# Input: path1, path2
+# Output: aligned array of SrtSegment corresponding to path1, path2
+# Note: append .source_text at the marked places to get an output array of strings only
+def alignment_obsolete(pred_path, gt_path):
+    empt = SrtSegment([0, '00:00:00,000 --> 00:00:00,000', '', '', ''])
+    pred = SrtScript.parse_from_srt_file(pred_path).segments
+    gt = SrtScript.parse_from_srt_file(gt_path).segments
+    pred_arr, gt_arr = [], []
+    idx_p, idx_t = 0, 0  # idx_p: current index of pred segment, idx_t for ground truth
+
+    while idx_p < len(pred) or idx_t < len(gt):
+        # Check whether one SRT file runs out while reading
+        ps = pred[idx_p] if idx_p < len(pred) else None
+        gs = gt[idx_t] if idx_t < len(gt) else None
+
+        if not ps:
+            # If pred runs out, align each remaining gt segment with a filler
+            gt_arr.append(gs)  # .source_text
+            pred_arr.append(empt)
+            idx_t += 1
+            continue
+
+        if not gs:
+            # If gt runs out, align each remaining pred segment with a filler
+            pred_arr.append(ps)  # .source_text
+            gt_arr.append(empt)
+            idx_p += 1
+            continue
+
+        ps_dur = ps.end - ps.start
+        gs_dur = gs.end - gs.start
+
+        # Compare durations to decide anchor and sub
+        if ps_dur <= gs_dur:
+            # Detect segment with no overlap
+            if ps.end < gs.start:
+                pred_arr.append(ps)  # .source_text
+                gt_arr.append(empt)  # append filler
+                idx_t -= 1  # reset ground truth index
+            else:
+
+                if gs.end >= ps.start:
+                    gt_arr.append(gs)  # .source_text
+                    pred_arr.append(ps)  # .source_text
+                    idx_p = procedure(gs, pred, pred_arr, idx_p + 1)
+                else:
+                    gt_arr[len(gt_arr) - 1] += gs  # .source_text
+                    # pred_arr.append(empt)
+                    idx_p -= 1
+        else:
+            # Same overlap-checking procedure with the roles swapped
+            if gs.end < ps.start:
+                gt_arr.append(gs)  # .source_text
+                pred_arr.append(empt)  # filler
+                idx_p -= 1  # reset
+            else:
+                if ps.end >= gs.start:
+                    pred_arr.append(ps)  # .source_text
+                    gt_arr.append(gs)  # .source_text
+                    idx_t = procedure(ps, gt, gt_arr, idx_t + 1)
+                else:  # filler pairing
+                    pred_arr[len(pred_arr) - 1] += ps
+                    idx_t -= 1
+
+        idx_p += 1
+        idx_t += 1
+    # for a in gt_arr:
+    #     print(a.translation)
+    return zip(pred_arr, gt_arr)
+
+# Input: path1, path2, threshold = 0.5 sec by default
+# Output: aligned array of SrtSegment corresponding to path1, path2
+def alignment(pred_path, gt_path, threshold=0.5):
+    empt = SrtSegment([0, '00:00:00,000 --> 00:00:00,000', '', '', ''])
+    pred = SrtScript.parse_from_srt_file(pred_path).segments
+    gt = SrtScript.parse_from_srt_file(gt_path).segments
+    pred_arr, gt_arr = [], []
+    idx_p, idx_t = 0, 0
+
+    while idx_p < len(pred) or idx_t < len(gt):
+        ps = pred[idx_p] if idx_p < len(pred) else empt
+        gs = gt[idx_t] if idx_t < len(gt) else empt
+
+        # Merging sequence for pred
+        while idx_p + 1 < len(pred) and pred[idx_p + 1].end <= gs.end + threshold:
+            ps += pred[idx_p + 1]
+            idx_p += 1
+
+        # Merging sequence for gt
+        while idx_t + 1 < len(gt) and gt[idx_t + 1].end <= ps.end + threshold:
+            gs += gt[idx_t + 1]
+            idx_t += 1
+
+        # Append to the result arrays
+        pred_arr.append(ps)
+        gt_arr.append(gs)
+        idx_p += 1
+        idx_t += 1
+
+
+    # for a in pred_arr:
+    #     print(a.translation)
+    # for a in gt_arr:
+    #     print(a.source_text)
+
+    return zip(pred_arr, gt_arr)
+
+
+# Test Case
+# alignment('test_translation_s2.srt', 'test_translation_zh.srt')
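Not part of the diff: a minimal sketch of how alignment() might be exercised on its own, assuming two SRT files in the layout SrtScript expects (the file names below are placeholders):

# Minimal usage sketch for alignment(); the .srt paths are hypothetical.
from alignment import alignment

pairs = alignment('pred_bi.srt', 'gt_zh.srt', threshold=0.5)
for pred_seg, gt_seg in pairs:
    # Each pair is a merged predicted segment and its ground-truth counterpart;
    # an empty filler segment stands in once one of the files runs out.
    print(pred_seg.translation, '||', gt_seg.source_text)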
evaluation/evaluation.py
ADDED
@@ -0,0 +1,58 @@
+import argparse
+import pandas as pd
+from alignment import alignment
+from scores.multi_scores import multi_scores
+
+class Evaluator:
+    def __init__(self, pred_path, gt_path, eval_path, res_path):
+        self.pred_path = pred_path
+        self.gt_path = gt_path
+        self.eval_path = eval_path
+        self.res_path = res_path
+
+    def eval(self):
+        # Align two SRT files
+        aligned_srt = alignment(self.pred_path, self.gt_path)
+
+        # Get sentence scores
+        scorer = multi_scores()
+        result_data = []
+        for (pred_s, gt_s) in aligned_srt:
+            print("pred_s.source_text: ", pred_s.source_text)
+            print("pred_s.translation: ", pred_s.translation)
+            print("gt_s.source_text: ", gt_s.source_text)
+
+            scores_dict = scorer.get_scores(pred_s.source_text, pred_s.translation, gt_s.source_text)
+            print("scores_dict: ", scores_dict)
+
+            scores_dict['Source'] = pred_s.source_text
+            scores_dict['Prediction'] = pred_s.translation
+            scores_dict['Ground Truth'] = gt_s.source_text
+            result_data.append(scores_dict)
+
+        eval_df = pd.DataFrame(result_data)
+        eval_df.to_csv(self.eval_path, index=False, columns=['Source', 'Prediction', 'Ground Truth', 'bleu_score', 'comet_score', 'llm_score', 'llm_explanation'])
+
+        # Get average scores
+        avg_llm = eval_df['llm_score'].mean()
+        avg_bleu = eval_df['bleu_score'].mean()
+        avg_comet = eval_df['comet_score'].mean()
+
+        res_data = {
+            'Metric': ['Avg LLM', 'Avg BLEU', 'Avg COMET'],
+            'Score': [avg_llm, avg_bleu, avg_comet]
+        }
+        res_df = pd.DataFrame(res_data)
+        res_df.to_csv(self.res_path, index=False)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Evaluate SRT files.')
+    parser.add_argument('-bi_path', default='evaluation/test5_tiny/test5_bi.srt', help='Path to predicted SRT file')
+    parser.add_argument('-zh_path', default='evaluation/test5_tiny/test5_gt.srt', help='Path to ground truth SRT file')
+    parser.add_argument('-eval_output', default='evaluation/test5_tiny/eval.csv', help='Path to eval CSV file')
+    parser.add_argument('-res_output', default='evaluation/test5_tiny/res.csv', help='Path to result CSV file')
+    args = parser.parse_args()
+
+    evaluator = Evaluator(args.bi_path, args.zh_path, args.eval_output, args.res_output)
+    evaluator.eval()
+
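Not part of the PR: a sketch of driving the Evaluator programmatically rather than through argparse. It assumes the working directory is evaluation/ so the local import resolves, and all paths are placeholders.

# Hypothetical programmatic use of the Evaluator class defined above.
from evaluation import Evaluator

evaluator = Evaluator(
    pred_path='path/to/pred_bi.srt',  # bilingual prediction SRT
    gt_path='path/to/gt_zh.srt',      # ground-truth SRT
    eval_path='path/to/eval.csv',     # per-segment scores are written here
    res_path='path/to/res.csv',       # averaged scores are written here
)
evaluator.eval()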
evaluation/readme.md
ADDED
@@ -0,0 +1,24 @@
+Evaluation:
+  BLEU (https://github.com/mjpost/sacrebleu)
+  COMET (https://github.com/Unbabel/COMET)
+  LLM eval
+  Eval time stamp
+
+Sep 18 - Sep 25
+Proj-t
+  src
+  evaluation
+    - scores
+      - LLM_eval.py (jiaen)
+      - scores.py (wizard)
+      - comet
+      - sacrebleu
+    - alignment.py (david)
+    - evaluation.py (not assigned)
+    - results
+      - mmddyy-HMS-results.csv
+    - logs
+
+entry:
+python3 evaluation/evaluation.py -bi_path path/to/pred -zh_path path/to/gt
+
evaluation/scores/LLM_eval.py
ADDED
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+# This script evaluates the performance of the Pigeon AI Video Translation system using a Large Language Model.
+
+# Written by Jiaen LIU, 2023/09/18
+
+# Import the necessary packages
+import re
+from langchain.evaluation import load_evaluator, EvaluatorType
+from langchain.prompts import PromptTemplate
+from langchain.chat_models import ChatOpenAI
+# from src.srt_util.srt import SrtScript
+
+# Load the evaluator
+
+def init_evaluator(source_lang="en", target_lang="zh", domain="starcraft 2", model="gpt-4-0613"):
+
+    # Map the language code to the language name
+    language_map = {
+        "en": "English",
+        "zh": "Chinese",
+    }
+
+    llm = ChatOpenAI(temperature=0, model=model)
+
+    # Completeness is the percentage of the input that is translated
+    # Accuracy is the percentage of the translation that is correct
+    fstring = """
+    You are grading the translation based on the following input:
+    {input}
+    If the input is "", that means there is no input sentence, and you should grade the translation based on the reference translation alone.
+    Here is the real answer (reference):
+    {reference}
+    You are grading the following translation:
+    {output}
+    based on the following criteria:
+    {criteria}
+    Give two grades, accuracy and completeness, rating each on a scale of 0 to 100, where 0 is the lowest (very low accuracy/completeness) and 100 is the highest (very high accuracy/completeness).
+    Give an explanation for each. A partially correct answer is acceptable; however, penalize the scores of answers that are numerically incorrect, including values with a $ in front.
+    Please give the completeness score first, followed by the accuracy score.
+    For example:
+    Accuracy: 40. Explanation here
+    Completeness: 80. Explanation here
+    Do not deviate from this format.
+    """
+
+    if source_lang in language_map and target_lang in language_map:
+        lang_str = f"You are an expert {language_map[source_lang]} to {language_map[target_lang]} translator specialized in {domain}."
+        prompt = PromptTemplate.from_template(lang_str + fstring, template_format="f-string")
+    else:
+        print("The language code is not supported, please check the language code.")
+        prompt = PromptTemplate.from_template(fstring, template_format="f-string")
+
+    return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
+
+# Parse the output of the evaluation
+# Example:
+# 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+# def parse_eval_result(eval_result):
+#     # score = eval_result.score
+#     value = eval_result["value"]
+#     value = value.split("Accuracy: ")[1].split(".")
+#     # combine the rest of the string into the whole explanation
+#     explanation = ".".join(value[1:])
+#     return int(value[0]), explanation
+
+# def parse_eval_result(eval_result):
+#     # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+#     accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['value'])
+#     print(accuracy_match)
+#     if accuracy_match:
+#         accuracy = int(accuracy_match.group(1))
+#     else:
+#         # try to get the accuracy from the 'value' key
+#         accuracy = 0
+
+#     # Directly get the 'Explanation' value from the 'value' key
+#     explanation = eval_result['value']
+
+#     return accuracy, explanation
+
+def parse_eval_result(data):
+    # Extract the value string
+    value_str = data.get('value', '')
+    reasoning_str = data.get('reasoning', '')
+
+    # Use regex to extract the accuracy value and explanation
+    accuracy_match = re.search(r'Accuracy: (\d+)', value_str)
+    acc_explanation_match = re.search(r'Accuracy: \d+\. (.+)', value_str)
+
+    # Use regex to extract the completeness value and explanation
+    completeness_match = re.search(r'Completeness: (\d+)', reasoning_str)
+    completeness_explanation_match = re.search(r'Completeness: \d+\. (.+)', reasoning_str)
+
+    # Extract the matched groups
+    completeness = int(completeness_match.group(1)) if completeness_match else None
+    completeness_explanation = completeness_explanation_match.group(1) if completeness_explanation_match else None
+    accuracy = int(accuracy_match.group(1)) if accuracy_match else None
+    acc_explanation = acc_explanation_match.group(1) if acc_explanation_match else None
+
+    return (accuracy, acc_explanation), (completeness, completeness_explanation)
+
+def evaluate_prediction(input, reference, prediction, evaluator):
+    eval_result = evaluator.evaluate_strings(
+        prediction=prediction,
+        input=input,
+        reference=reference,
+    )
+    # print(eval_result)
+    return parse_eval_result(eval_result)
+
+if __name__ == "__main__":
+    evaluator = init_evaluator()
+    # For no input English sentence, just put "" in the input
+    accuracy, completeness = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy[0])
+    print("Acc_Explanation:", accuracy[1])
+    print("Completeness:", completeness[0])
+    print("Comp_Explanation:", completeness[1])
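To make the parsing concrete, a small sketch (not from the PR) of parse_eval_result applied to a hand-written result dict; the dict below is hypothetical, not real model output:

# Hypothetical evaluator output, just to illustrate the parsing regexes above.
sample = {
    'value': 'Accuracy: 80. The translation keeps the meaning but drops one clause.',
    'reasoning': 'Completeness: 90. Almost the whole source sentence is covered.',
}
(acc, acc_expl), (comp, comp_expl) = parse_eval_result(sample)
# acc == 80 and comp == 90; the explanations are the text after each score.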
evaluation/scores/__init__.py
ADDED
File without changes
evaluation/scores/multi_scores.py
ADDED
@@ -0,0 +1,63 @@
+from comet import download_model, load_from_checkpoint
+from sacrebleu.metrics import BLEU, CHRF, TER
+from scores import LLM_eval
+# import LLM_eval
+
+class multi_scores:
+    def __init__(self, source_lang="en", target_lang="zh", domain="starcraft 2") -> None:
+        self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
+        self.bleu_model = BLEU(tokenize=target_lang)
+        self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
+        # self.score = {}
+
+    def __preprocess(self, src:str, mt:str, ref:str) -> dict:
+        # Remove the spaces at the beginning and end of each sentence
+        src = src.strip()
+        mt = mt.strip()
+        ref = ref.strip()
+        print(src, mt, ref)
+        return {'src':src, 'mt':mt, 'ref':ref}
+
+
+
+    # The function to get the scores
+    # src: original sentence
+    # mt: machine translation
+    # ref: reference translation
+    def calculate_comet_llm(self, src:str, mt:str, ref:str) -> dict:
+        # Preprocess the input (__preprocess returns a dict, so unpack its values)
+        pre = self.__preprocess(src, mt, ref)
+        src, mt, ref = pre['src'], pre['mt'], pre['ref']
+        comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
+        # bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+        # self.score['bleu_score'] = bleu_score
+        # self.score['comet_score'] = comet_score
+        # self.score['llm_score'] = llm_score
+        # self.score['llm_explanation'] = llm_explanation
+
+    def calculate_bleu(self, mts:list, refs:list) -> dict:
+        # src, mt, ref = self.__preprocess(src, mt, ref)
+        # remove the space at the beginning and end of each sentence
+        # mts = [mt.strip() for mt in mts]
+        # refs = [ref.strip() for ref in refs]
+        # print(mts, refs)
+        # mts and refs are lists of sentences
+        bleu_score = self.bleu_model.corpus_score(mts, refs).score
+        return {'bleu_score':bleu_score}
+
+    def get_scores(self, src:str, mt:str, ref:str) -> dict:
+        comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
+        bleu_score = self.bleu_model.corpus_score([mt], [[ref]]).score
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+
+
+if __name__ == "__main__":
+    src = "South Korea playing with the Blue Proto's Probes"
+    mt = "位于对角线的另一个角落 使用蓝色的Proto's Probes"
+    ref = " 在对角落里使用蓝色神族探机 他的名字是..."
+    # print(multi_scores().get_scores(src, mt, ref))
+    # print(multi_scores().calculate_comet_llm(src, mt, ref))
+    print(multi_scores().calculate_bleu([mt], [[ref]]))
+
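For orientation, a hedged sketch of how get_scores would typically be called per aligned segment. The sentences are placeholders; running it downloads the COMET checkpoint and calls the OpenAI API, so it is not a free-standing test.

# Hypothetical per-segment scoring call; needs the COMET model and an OpenAI key.
scorer = multi_scores(source_lang="en", target_lang="zh", domain="starcraft 2")
result = scorer.get_scores(
    src="Protoss opens with a quick expansion.",
    mt="神族以快速扩张开局。",
    ref="神族选择速开分矿。",
)
# result -> {'bleu_score': ..., 'comet_score': ..., 'llm_score': ..., 'llm_explanation': ...}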
evaluation/scores/score.py
ADDED
@@ -0,0 +1,15 @@
+from comet import download_model, load_from_checkpoint
+from sacrebleu.metrics import BLEU, CHRF, TER
+
+def COMETscore(src, mt, ref):
+    data = []
+    # Build one COMET input dict per (source, hypothesis, reference) triple
+    for i in range(len(src)):
+        data.append({"src":src[i], "mt":mt[i], "ref":ref[i]})
+    model_path = download_model("Unbabel/wmt22-comet-da")
+    model = load_from_checkpoint(model_path)
+    model_output = model.predict(data, batch_size=8, gpus=0)
+    return model_output
+
+def BLEUscore(sys, refs):
+    bleu = BLEU()
+    return bleu.corpus_score(sys, refs)
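A quick hedged example of the BLEUscore helper above: sacrebleu's corpus_score expects a list of hypotheses and a list of reference streams (list of lists). The sentences are made up.

# Made-up sentences, just to show the expected list-of-lists reference shape.
sys_out = ["the cat sat on the mat"]
refs = [["the cat is sitting on the mat"]]  # one reference stream
print(BLEUscore(sys_out, refs).score)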
requirement.txt
CHANGED
@@ -38,3 +38,5 @@ tqdm==4.65.0
 typing_extensions==4.5.0
 urllib3==1.26.15
 yarl==1.8.2
+sacrebleu==2.3.1
+unbabel-comet==2.1.0
src/srt_util/srt.py
CHANGED
@@ -50,7 +50,10 @@ class SrtSegment(object):
         self.start = int(start_list[0]) * 3600 + int(start_list[1]) * 60 + int(start_list[2]) + self.start_ms / 100
         end_list = self.end_time_str.split(',')[0].split(':')
         self.end = int(end_list[0]) * 3600 + int(end_list[1]) * 60 + int(end_list[2]) + self.end_ms / 100
-
+        if len(args[0]) < 5:
+            self.translation = ""
+        else:
+            self.translation = args[0][3]
 
     def merge_seg(self, seg):
         """
@@ -105,10 +108,16 @@ class SrtScript(object):
     def parse_from_srt_file(cls, path: str):
         with open(path, 'r', encoding="utf-8") as f:
             script_lines = [line.rstrip() for line in f.readlines()]
-
+        bilingual = False
+        if script_lines[2] != '' and script_lines[3] != '':
+            bilingual = True
         segments = []
-
-
+        if bilingual:
+            for i in range(0, len(script_lines), 5):
+                segments.append(list(script_lines[i:i + 5]))
+        else:
+            for i in range(0, len(script_lines), 4):
+                segments.append(list(script_lines[i:i + 4]))
 
         return cls(segments)
 
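For context (not part of the diff), this is the block shape the new parsing logic appears to assume: four lines per segment in a monolingual SRT and five in a bilingual one, with the fourth line becoming the segment's translation via args[0][3]. The sample lines below are illustrative only.

# Illustrative SRT blocks matching the 4-line vs 5-line grouping above.
mono_block = [
    "1",
    "00:00:01,000 --> 00:00:03,000",
    "This is the source line.",
    "",
]
bi_block = [
    "1",
    "00:00:01,000 --> 00:00:03,000",
    "This is the source line.",
    "这是源语言对应的译文。",  # picked up as the segment's translation
    "",
]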