JiaenLiu committed on
Commit
fb908a6
·
1 Parent(s): 6723c13

Former-commit-id: 713a1c3babc77b7865e7aae1fbe0edd72ffaf381

evaluation/scores/LLM_eval.py CHANGED
@@ -5,6 +5,7 @@
5
 
6
  # Import the necessary packages
7
  import re
 
8
  from langchain.evaluation import load_evaluator, EvaluatorType
9
  from langchain.prompts import PromptTemplate
10
  from langchain.chat_models import ChatOpenAI
@@ -22,7 +23,7 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
22
 
23
  llm = ChatOpenAI(temperature=0, model=model)
24
 
25
- # Completeness is the percentage of the input that is translated
26
  # Accuracy is the percentage of the translation that is correct
27
  fstring = """
28
  You are grading the translation based on following input:
@@ -83,16 +84,18 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
83
 
84
  def parse_eval_result(data):
85
  # Extract the value and reasoning strings
86
- value_str = data.get('value', '')
87
- reasoning_str = data.get('reasoning', '')
 
 
88
 
89
  # Use regex to extract accuracy value and explanation
90
- accuracy_match = re.search(r'Accuracy: (\d+)', value_str)
91
- acc_explanation_match = re.search(r'Accuracy: \d+\. (.+)', value_str)
92
 
93
  # Use regex to extract completeness value and explanation
94
- completeness_match = re.search(r'Completeness: (\d+)', reasoning_str)
95
- completeness_explanation_match = re.search(r'Completeness: \d+\. (.+)', reasoning_str)
96
 
97
  # Extract the matched groups
98
  completeness = int(completeness_match.group(1)) if completeness_match else None
@@ -108,13 +111,13 @@ def evaluate_prediction(input, reference, prediction, evaluator):
108
  input=input,
109
  reference=reference,
110
  )
111
- # print(eval_result)
112
  return parse_eval_result(eval_result)
113
 
114
  if __name__ == "__main__":
115
  evaluator = init_evaluator()
116
  # If there is no English source sentence, pass "" as the input
117
- accuracy, completeness = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
118
  print("Accuracy:", accuracy[0])
119
  print("Acc_Explanation:", accuracy[1])
120
  print("Completeness:", completeness[0])
 
5
 
6
  # Import the necessary packages
7
  import re
8
+
9
  from langchain.evaluation import load_evaluator, EvaluatorType
10
  from langchain.prompts import PromptTemplate
11
  from langchain.chat_models import ChatOpenAI
 
23
 
24
  llm = ChatOpenAI(temperature=0, model=model)
25
 
26
+ # Completeness is the percentage of the input that is translated; it checks whether any information is missing
27
  # Accuracy is the percentage of the translation that is correct
28
  fstring = """
29
  You are grading the translation based on following input:
 
84
 
85
  def parse_eval_result(data):
86
  # Extract the value and reasoning strings
87
+ value_str = data.get('value', '').lower()
88
+ reasoning_str = data.get('reasoning', '').lower()
89
+
90
+ response = value_str + reasoning_str
91
 
92
  # Use regex to extract accuracy value and explanation
93
+ accuracy_match = re.search(r'accuracy: (\d+)', response)
94
+ acc_explanation_match = re.search(r'accuracy: \d+\. (.+)', response)
95
 
96
  # Use regex to extract completeness value and explanation
97
+ completeness_match = re.search(r'completeness: (\d+)', response)
98
+ completeness_explanation_match = re.search(r'completeness: \d+\. (.+)', response)
99
 
100
  # Extract the matched groups
101
  completeness = int(completeness_match.group(1)) if completeness_match else None
 
111
  input=input,
112
  reference=reference,
113
  )
114
+ print(eval_result)
115
  return parse_eval_result(eval_result)
116
 
117
  if __name__ == "__main__":
118
  evaluator = init_evaluator()
119
  # If there is no English source sentence, pass "" as the input
120
+ accuracy, completeness = evaluate_prediction("it's obviously going to be 神族 trying to go for a 野炮台", " 每当我们看到BF开", " 每当我们看到BF开", evaluator)
121
  print("Accuracy:", accuracy[0])
122
  print("Acc_Explanation:", accuracy[1])
123
  print("Completeness:", completeness[0])
evaluation/scores/multi_scores.py CHANGED
@@ -50,7 +50,7 @@ class multi_scores:
50
  comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
51
  bleu_score = self.bleu_model.corpus_score([mt], [[ref]]).score
52
  llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
53
- return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
54
 
55
 
56
  if __name__ == "__main__":
 
50
  comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
51
  bleu_score = self.bleu_model.corpus_score([mt], [[ref]]).score
52
  llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
53
+ return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1], 'llm_completeness':llm_completeness[0], 'llm_completeness_explanation':llm_completeness[1]}
54
 
55
 
56
  if __name__ == "__main__":