translation_glossary_checker/llm_post_editor.py
import json
import re
from typing import Any, Dict, List

from anthropic import Anthropic


class LLMTranslationEditor:
    def __init__(self, validation_results: dict, anthropic_api_key: str):
        """Initialize with validation results and Anthropic API key.

        Args:
            validation_results (dict): Results from TranslationValidator
            anthropic_api_key (str): Anthropic API key for Claude access
        """
        self.results = validation_results
        self.client = Anthropic(api_key=anthropic_api_key)
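
    # The shape below is what this class assumes about the validator output.
    # It is inferred from how the fields are read in edit_translation() and
    # post_edit_translations(), not taken from TranslationValidator itself,
    # so treat it as a sketch rather than the authoritative schema.
    #
    # validation_results = {
    #     "lines": [
    #         {
    #             "line_number": 1,
    #             "source": "<Tibetan line>",
    #             "target": "<current English translation>",
    #             "terms": [
    #                 {
    #                     "source_term": "<Tibetan term>",
    #                     "categories": {
    #                         "<category name>": {
    #                             "translations": ["..."],
    #                             "definitions": ["..."]
    #                         }
    #                     },
    #                     "analysis": {
    #                         "translated_as": "<rendering found in target>",
    #                         "glossary_translation": "<preferred rendering>",
    #                         "matching_categories": ["<category name>"],
    #                         "translation_assessment": {
    #                             "should_be_counted": True,
    #                             "translated_correctly": False
    #                         }
    #                     }
    #                 }
    #             ]
    #         }
    #     ]
    # }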

    def edit_translation(self, source_text: str, current_translation: str,
                         terms_info: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Use Claude to edit the translation considering validation results and context.

        Args:
            source_text (str): Original Tibetan text
            current_translation (str): Current English translation
            terms_info (list): Terms information from validation results

        Returns:
            Dict[str, Any]: Edited translation with analysis
        """
        # Build context for terms that need attention
        terms_context = []
        for term in terms_info:
            analysis = term['analysis']
            assessment = analysis['translation_assessment']
            if assessment['should_be_counted'] and not assessment['translated_correctly']:
                term_context = {
                    'term': term['source_term'],
                    'current': analysis['translated_as'],
                    'suggested': analysis['glossary_translation'],
                    'categories': {}
                }
                # Add category information
                for cat_name, cat_data in term['categories'].items():
                    if cat_name in analysis['matching_categories']:
                        term_context['categories'][cat_name] = {
                            'translations': cat_data.get('translations', []),
                            'definitions': cat_data.get('definitions', [])
                        }
                terms_context.append(term_context)

        if not terms_context:
            return {
                'edited_translation': current_translation,
                'modified': False,
                'reasoning': 'No terms requiring editing'
            }
prompt = f"""You are an expert Tibetan translator. Review and improve this translation, focusing on accuracy and natural English:
Tibetan text: {source_text}
Current translation: {current_translation}
The following terms need attention:"""
for term in terms_context:
prompt += f"\n\nTibetan term: {term['term']}"
prompt += f"\nCurrently translated as: {term['current']}"
prompt += f"\nGlossary suggestion: {term['suggested']}"
for cat_name, cat_data in term['categories'].items():
prompt += f"\n{cat_name}:"
if cat_data['definitions']:
prompt += f"\n- Definitions: {', '.join(cat_data['definitions'])}"
if cat_data['translations']:
prompt += f"\n- Translations: {', '.join(cat_data['translations'])}"
prompt += """
Please provide:
1. An improved translation that:
- Maintains the meaning of the Tibetan text
- Maintains the style and tone of the current translation
- Uses appropriate technical terms from the glossary
- Preserves any correct parts of the current translation
2. Your reasoning for the changes
Respond in JSON format:
{
"edited_translation": "your improved translation",
"reasoning": "explanation of changes and decisions",
"modified": true/false
}"""

        try:
            message = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=1000,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            # Extract the JSON object from the response text
            json_match = re.search(r'\{.*\}', message.content[0].text, re.DOTALL)
            if json_match:
                parsed = json.loads(json_match.group())
                edited = parsed.get('edited_translation', current_translation)
                # Guard against responses that omit expected keys
                return {
                    'edited_translation': edited,
                    'modified': parsed.get('modified', edited != current_translation),
                    'reasoning': parsed.get('reasoning', '')
                }
            else:
                return {
                    'edited_translation': current_translation,
                    'modified': False,
                    'reasoning': 'Failed to parse LLM response'
                }
        except Exception as e:
            print(f"Error during LLM editing: {e}")
            return {
                'edited_translation': current_translation,
                'modified': False,
                'reasoning': f'LLM editing failed: {str(e)}'
            }

    def post_edit_translations(self) -> List[Dict[str, Any]]:
        """Process all lines and post-edit translations using LLM.

        Returns:
            List[Dict[str, Any]]: List of edited translations with analysis
        """
        edited_translations = []
        for line in self.results['lines']:
            source = line['source']
            target = line['target']
            terms = line['terms']

            if not terms:
                edited_translations.append({
                    'line_number': line['line_number'],
                    'source': source,
                    'original': target,
                    'edited': target,
                    'modified': False,
                    'reasoning': 'No terms to edit'
                })
                continue

            # Get LLM to edit the translation
            edit_result = self.edit_translation(source, target, terms)
            edited_translations.append({
                'line_number': line['line_number'],
                'source': source,
                'original': target,
                'edited': edit_result['edited_translation'],
                'modified': edit_result['modified'],
                'reasoning': edit_result['reasoning']
            })
        return edited_translations

    def save_edits(self, edited_translations: List[Dict[str, Any]],
                   output_path: str) -> None:
        """Save the post-edited translations with analysis to a file.

        Args:
            edited_translations (List[Dict[str, Any]]): Edited translations with analysis
            output_path (str): Path to save results
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                'summary': {
                    'total_lines': len(edited_translations),
                    'modified_lines': sum(1 for t in edited_translations if t['modified'])
                },
                'translations': edited_translations
            }, f, ensure_ascii=False, indent=2)
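
    # The written file has roughly this shape (inferred from the json.dump
    # call above; the concrete values are illustrative only):
    #
    # {
    #     "summary": {"total_lines": 120, "modified_lines": 37},
    #     "translations": [
    #         {
    #             "line_number": 1,
    #             "source": "<Tibetan line>",
    #             "original": "<original translation>",
    #             "edited": "<post-edited translation>",
    #             "modified": true,
    #             "reasoning": "<model's explanation>"
    #         }
    #     ]
    # }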


# Example usage:
if __name__ == "__main__":
    import os

    # Load validation results
    with open('data/validation_results.json', 'r', encoding='utf-8') as f:
        validation_results = json.load(f)

    # Create editor and process translations
    editor = LLMTranslationEditor(
        validation_results,
        os.getenv('ANTHROPIC_API_KEY')
    )
    edited_translations = editor.post_edit_translations()

    # Save results
    editor.save_edits(edited_translations, 'llm_post_edited_translations.json')

    # Print summary and examples
    print("Post-editing completed:")
    print(f"Total lines: {len(edited_translations)}")
    print(f"Modified lines: {sum(1 for t in edited_translations if t['modified'])}")

    print("\nExample modifications:")
    for trans in edited_translations:
        if trans['modified']:
            print(f"\nLine {trans['line_number']}:")
            print(f"Source  : {trans['source']}")
            print(f"Original: {trans['original']}")
            print(f"Edited  : {trans['edited']}")
            print(f"Reasoning: {trans['reasoning']}")