|
import json |
|
from typing import Any, Dict, List |
|
|
|
from anthropic import Anthropic |
|
|
|
|
|
class LLMTranslationEditor:
    """Post-edit translations with Claude, guided by glossary validation results.

    For every line in the validation results, the terms that were flagged as
    countable but incorrectly translated are collected, sent to the model
    together with the source text and current translation, and the model's
    JSON answer is normalised into a fixed-shape result dict.
    """

    # Defaults preserve the values that used to be hard-coded in the API call.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 1000

    # Trailing instructions appended to every prompt.
    _RESPONSE_INSTRUCTIONS = "\n".join((
        "",
        "",
        "Please provide:",
        "1. An improved translation that:",
        "   - Maintains the meaning of the Tibetan text",
        "   - Maintains the style and tone of the current translation",
        "   - Uses appropriate technical terms from the glossary",
        "   - Preserves any correct parts of the current translation",
        "2. Your reasoning for the changes",
        "",
        "Respond in JSON format:",
        "{",
        '    "edited_translation": "your improved translation",',
        '    "reasoning": "explanation of changes and decisions",',
        '    "modified": true/false',
        "}",
    ))

    def __init__(self, validation_results: dict, anthropic_api_key: str,
                 model: str = DEFAULT_MODEL,
                 max_tokens: int = DEFAULT_MAX_TOKENS,
                 client: Any = None):
        """Initialize with validation results and Anthropic API key.

        Args:
            validation_results (dict): Results from TranslationValidator
            anthropic_api_key (str): Anthropic API key for Claude access
            model (str): Model identifier passed to the Messages API
            max_tokens (int): Upper bound on tokens generated per reply
            client: Optional pre-built client exposing ``messages.create``;
                when omitted, an ``Anthropic`` client is created from the key
        """
        self.results = validation_results
        self.model = model
        self.max_tokens = max_tokens
        if client is None:
            client = Anthropic(api_key=anthropic_api_key)
        self.client = client

    @staticmethod
    def _unchanged(current_translation: str, reasoning: str) -> Dict[str, Any]:
        """Build the result dict for a translation that is left as it is."""
        return {
            'edited_translation': current_translation,
            'modified': False,
            'reasoning': reasoning
        }

    @staticmethod
    def _collect_terms_context(terms_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Select the terms that are counted but mistranslated, with glossary data."""
        contexts = []
        for term in terms_info:
            analysis = term['analysis']
            assessment = analysis['translation_assessment']
            if not assessment['should_be_counted'] or assessment['translated_correctly']:
                continue

            matching = analysis['matching_categories']
            categories = {}
            for cat_name, cat_data in term['categories'].items():
                if cat_name not in matching:
                    continue
                # `or []` also covers keys that are present but null.
                categories[cat_name] = {
                    'translations': cat_data.get('translations') or [],
                    'definitions': cat_data.get('definitions') or []
                }

            contexts.append({
                'term': term['source_term'],
                'current': analysis['translated_as'],
                'suggested': analysis['glossary_translation'],
                'categories': categories
            })
        return contexts

    def _build_prompt(self, source_text: str, current_translation: str,
                      terms_context: List[Dict[str, Any]]) -> str:
        """Assemble the full prompt: task header, per-term glossary notes, reply format."""
        parts = [
            "You are an expert Tibetan translator. Review and improve this "
            "translation, focusing on accuracy and natural English:\n"
            "\n"
            f"Tibetan text: {source_text}\n"
            f"Current translation: {current_translation}\n"
            "\n"
            "The following terms need attention:"
        ]
        for term in terms_context:
            parts.append(f"\n\nTibetan term: {term['term']}")
            parts.append(f"\nCurrently translated as: {term['current']}")
            parts.append(f"\nGlossary suggestion: {term['suggested']}")
            for cat_name, cat_data in term['categories'].items():
                parts.append(f"\n{cat_name}:")
                # str() guards against non-string glossary entries.
                if cat_data['definitions']:
                    joined = ', '.join(str(d) for d in cat_data['definitions'])
                    parts.append(f"\n- Definitions: {joined}")
                if cat_data['translations']:
                    joined = ', '.join(str(t) for t in cat_data['translations'])
                    parts.append(f"\n- Translations: {joined}")
        parts.append(self._RESPONSE_INSTRUCTIONS)
        return "".join(parts)

    @staticmethod
    def _extract_json_object(text: str):
        """Return the first JSON object embedded in ``text``, or None.

        Each ``{`` is tried as the start of a JSON document, so prose before
        or after the object (even prose containing braces) does not prevent
        the object from being found.
        """
        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text[start:])
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find('{', start + 1)
        return None

    def edit_translation(self, source_text: str, current_translation: str,
                         terms_info: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Use Claude to edit the translation considering validation results and context.

        Args:
            source_text (str): Original Tibetan text
            current_translation (str): Current English translation
            terms_info (list): Terms information from validation results

        Returns:
            Dict[str, Any]: Always a dict with exactly the keys
            ``edited_translation`` (str), ``modified`` (bool) and
            ``reasoning`` (str). When no term needs editing, the API call
            fails, or the reply cannot be parsed, the current translation is
            returned with ``modified`` set to False.

        Raises:
            KeyError: If a term entry lacks one of the expected keys.
        """
        terms_context = self._collect_terms_context(terms_info)
        if not terms_context:
            return self._unchanged(current_translation, 'No terms requiring editing')

        prompt = self._build_prompt(source_text, current_translation, terms_context)

        # Best-effort boundary: any API failure keeps the current translation
        # so that one bad line does not abort a whole batch.
        try:
            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            reply_text = message.content[0].text
        except Exception as e:
            print(f"Error during LLM editing: {e}")
            return self._unchanged(current_translation, f'LLM editing failed: {e}')

        parsed = self._extract_json_object(reply_text)
        if parsed is None or not isinstance(parsed.get('edited_translation'), str):
            return self._unchanged(current_translation, 'Failed to parse LLM response')

        edited = parsed['edited_translation']
        modified = parsed.get('modified')
        if not isinstance(modified, bool):
            # The model omitted or garbled the flag; derive it from the text.
            modified = edited != current_translation

        return {
            'edited_translation': edited,
            'modified': modified,
            'reasoning': str(parsed.get('reasoning', ''))
        }

    def post_edit_translations(self) -> List[Dict[str, Any]]:
        """Process all lines and post-edit translations using LLM.

        Lines without any terms are passed through unchanged and never reach
        the model.

        Returns:
            List[Dict[str, Any]]: One record per line with the keys
            ``line_number``, ``source``, ``original``, ``edited``,
            ``modified`` and ``reasoning``.
        """
        edited_translations = []
        for line in self.results['lines']:
            source = line['source']
            target = line['target']
            terms = line['terms']

            if terms:
                edit_result = self.edit_translation(source, target, terms)
            else:
                edit_result = self._unchanged(target, 'No terms to edit')

            edited_translations.append({
                'line_number': line['line_number'],
                'source': source,
                'original': target,
                'edited': edit_result['edited_translation'],
                'modified': edit_result['modified'],
                'reasoning': edit_result['reasoning']
            })
        return edited_translations

    def save_edits(self, edited_translations: List[Dict[str, Any]],
                   output_path: str) -> None:
        """Save the post-edited translations with analysis to a file.

        The file is UTF-8 JSON with a ``summary`` (total and modified line
        counts) and the full ``translations`` list.

        Args:
            edited_translations (List[Dict[str, Any]]): Edited translations with analysis
            output_path (str): Path to save results
        """
        summary = {
            'total_lines': len(edited_translations),
            'modified_lines': sum(1 for t in edited_translations if t['modified'])
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                'summary': summary,
                'translations': edited_translations
            }, f, ensure_ascii=False, indent=2)
|
|
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Fail fast on a missing key: edit_translation catches every exception
    # raised by the API call, so an authentication problem would presumably
    # show up only as "LLM editing failed" on each line, not as an error.
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise SystemExit("ANTHROPIC_API_KEY environment variable is not set")

    # Load the validation results that drive the post-editing pass.
    with open('data/validation_results.json', 'r', encoding='utf-8') as f:
        validation_results = json.load(f)

    # Run the post-editor over every line.
    editor = LLMTranslationEditor(validation_results, api_key)
    edited_translations = editor.post_edit_translations()

    # Persist the edited translations together with their summary.
    editor.save_edits(edited_translations, 'llm_post_edited_translations.json')

    # Report totals, then every line the model changed.
    modified_translations = [t for t in edited_translations if t['modified']]
    print("Post-editing completed:")
    print(f"Total lines: {len(edited_translations)}")
    print(f"Modified lines: {len(modified_translations)}")

    print("\nExample modifications:")
    for trans in modified_translations:
        print(f"\nLine {trans['line_number']}:")
        print(f"Source  : {trans['source']}")
        print(f"Original: {trans['original']}")
        print(f"Edited  : {trans['edited']}")
        print(f"Reasoning: {trans['reasoning']}")