Spaces:

openpecha
/

translation_glossary_checker

Running

translation_glossary_checker / llm_post_editor.py

test

add llm based post editor

2ba7d76 11 days ago

7.8 kB

	import json
	from typing import Any, Dict, List

	from anthropic import Anthropic


	class LLMTranslationEditor:
	    """Post-edit translations with Claude, guided by glossary validation results.

	    The editor walks the ``lines`` of a validation-results dict, asks the
	    model to revise each line that has glossary terms flagged as counted but
	    not translated correctly, and produces one result record per line.
	    """

	    # Defaults match the values this class has always sent to the API.
	    # NOTE(review): confirm this model ID is still served by the API; pass
	    # ``model=...`` to the constructor to use a different one.
	    DEFAULT_MODEL = "claude-3-sonnet-20240229"
	    DEFAULT_MAX_TOKENS = 1000

	    def __init__(self, validation_results: dict, anthropic_api_key: str,
	                 model: str = DEFAULT_MODEL,
	                 max_tokens: int = DEFAULT_MAX_TOKENS):
	        """Initialize with validation results and Anthropic API key.

	        Args:
	            validation_results (dict): Results from TranslationValidator;
	                must contain a ``'lines'`` list.
	            anthropic_api_key (str): Anthropic API key for Claude access
	            model (str): Model ID sent with every request.
	            max_tokens (int): Response token limit sent with every request.
	        """
	        self.results = validation_results
	        self.client = Anthropic(api_key=anthropic_api_key)
	        self.model = model
	        self.max_tokens = max_tokens

	    @staticmethod
	    def _unchanged(current_translation: str, reasoning: str) -> Dict[str, Any]:
	        """Build the result returned when the translation is left as-is."""
	        return {
	            'edited_translation': current_translation,
	            'modified': False,
	            'reasoning': reasoning
	        }

	    @staticmethod
	    def _collect_terms_context(terms_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	        """Select the terms that need attention and gather their glossary data.

	        A term needs attention when its assessment says it should be counted
	        and it was not translated correctly. Only categories listed in the
	        term's ``matching_categories`` are kept.
	        """
	        terms_context = []
	        for term in terms_info:
	            analysis = term['analysis']
	            assessment = analysis['translation_assessment']
	            if not assessment['should_be_counted'] or assessment['translated_correctly']:
	                continue

	            terms_context.append({
	                'term': term['source_term'],
	                'current': analysis['translated_as'],
	                'suggested': analysis['glossary_translation'],
	                'categories': {
	                    cat_name: {
	                        'translations': cat_data.get('translations', []),
	                        'definitions': cat_data.get('definitions', [])
	                    }
	                    for cat_name, cat_data in term['categories'].items()
	                    if cat_name in analysis['matching_categories']
	                }
	            })
	        return terms_context

	    @staticmethod
	    def _build_prompt(source_text: str, current_translation: str,
	                      terms_context: List[Dict[str, Any]]) -> str:
	        """Assemble the editing prompt: header, one section per term, JSON instructions."""
	        header = f"""You are an expert Tibetan translator. Review and improve this translation, focusing on accuracy and natural English:

	Tibetan text: {source_text}
	Current translation: {current_translation}

	The following terms need attention:"""

	        footer = """

	Please provide:
	1. An improved translation that:
	- Maintains the meaning of the Tibetan text
	- Maintains the style and tone of the current translation
	- Uses appropriate technical terms from the glossary
	- Preserves any correct parts of the current translation
	2. Your reasoning for the changes

	Respond in JSON format:
	{
	"edited_translation": "your improved translation",
	"reasoning": "explanation of changes and decisions",
	"modified": true/false
	}"""

	        parts = [header]
	        for term in terms_context:
	            parts.append(f"\n\nTibetan term: {term['term']}")
	            parts.append(f"\nCurrently translated as: {term['current']}")
	            parts.append(f"\nGlossary suggestion: {term['suggested']}")

	            for cat_name, cat_data in term['categories'].items():
	                parts.append(f"\n{cat_name}:")
	                # str() guards against non-string glossary entries, which
	                # would otherwise make join() raise outside any try block.
	                if cat_data['definitions']:
	                    parts.append(f"\n- Definitions: {', '.join(map(str, cat_data['definitions']))}")
	                if cat_data['translations']:
	                    parts.append(f"\n- Translations: {', '.join(map(str, cat_data['translations']))}")
	        parts.append(footer)
	        return "".join(parts)

	    @staticmethod
	    def _parse_response(response_text: str, current_translation: str) -> Dict[str, Any]:
	        """Extract and normalize the JSON object from the model's reply.

	        Always returns a dict with exactly the keys ``edited_translation``,
	        ``modified`` and ``reasoning``, so callers can index it safely. If no
	        JSON object is found, it cannot be decoded, or it carries no usable
	        ``edited_translation``, the current translation is returned unchanged.
	        """
	        import re

	        failure = LLMTranslationEditor._unchanged(
	            current_translation, 'Failed to parse LLM response')

	        # Greedy match: from the first '{' to the last '}' in the reply.
	        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
	        if not json_match:
	            return failure

	        try:
	            parsed = json.loads(json_match.group())
	        except json.JSONDecodeError:
	            # e.g. the model echoed the prompt's "true/false" placeholder.
	            return failure
	        if not isinstance(parsed, dict):
	            return failure

	        edited = parsed.get('edited_translation')
	        if not isinstance(edited, str) or not edited.strip():
	            return failure

	        modified = parsed.get('modified')
	        if not isinstance(modified, bool):
	            # Flag missing or malformed: derive it from the text itself.
	            modified = edited != current_translation

	        reasoning = parsed.get('reasoning')
	        return {
	            'edited_translation': edited,
	            'modified': modified,
	            'reasoning': '' if reasoning is None else str(reasoning)
	        }

	    def edit_translation(self, source_text: str, current_translation: str,
	                         terms_info: List[Dict[str, Any]]) -> Dict[str, Any]:
	        """Use Claude to edit the translation considering validation results and context.

	        Args:
	            source_text (str): Original Tibetan text
	            current_translation (str): Current English translation
	            terms_info (list): Terms information from validation results

	        Returns:
	            Dict[str, Any]: Dict with ``edited_translation``, ``modified`` and
	            ``reasoning``. The current translation is returned unmodified when
	            no term needs attention, when the request fails, or when the
	            reply cannot be parsed.
	        """
	        terms_context = self._collect_terms_context(terms_info)
	        if not terms_context:
	            return self._unchanged(current_translation, 'No terms requiring editing')

	        prompt = self._build_prompt(source_text, current_translation, terms_context)

	        try:
	            message = self.client.messages.create(
	                model=self.model,
	                max_tokens=self.max_tokens,
	                temperature=0,
	                messages=[{"role": "user", "content": prompt}]
	            )
	            return self._parse_response(message.content[0].text, current_translation)
	        except Exception as e:
	            # Best-effort boundary: one failed request must not abort the
	            # whole batch, so report it and keep the current translation.
	            print(f"Error during LLM editing: {e}")
	            return self._unchanged(current_translation, f'LLM editing failed: {str(e)}')

	    def post_edit_translations(self) -> List[Dict[str, Any]]:
	        """Process all lines and post-edit translations using LLM.

	        Returns:
	            List[Dict[str, Any]]: One record per input line with keys
	            ``line_number``, ``source``, ``original``, ``edited``,
	            ``modified`` and ``reasoning``.
	        """
	        edited_translations = []

	        for line in self.results['lines']:
	            source = line['source']
	            target = line['target']
	            terms = line['terms']

	            if terms:
	                edit_result = self.edit_translation(source, target, terms)
	            else:
	                # Nothing to check against the glossary; skip the LLM call.
	                edit_result = self._unchanged(target, 'No terms to edit')

	            edited_translations.append({
	                'line_number': line['line_number'],
	                'source': source,
	                'original': target,
	                'edited': edit_result['edited_translation'],
	                'modified': edit_result['modified'],
	                'reasoning': edit_result['reasoning']
	            })

	        return edited_translations

	    def save_edits(self, edited_translations: List[Dict[str, Any]],
	                   output_path: str) -> None:
	        """Save the post-edited translations with analysis to a file.

	        Writes UTF-8 JSON with a ``summary`` (total and modified line counts)
	        and the full ``translations`` list.

	        Args:
	            edited_translations (List[Dict[str, Any]]): Edited translations with analysis
	            output_path (str): Path to save results
	        """
	        payload = {
	            'summary': {
	                'total_lines': len(edited_translations),
	                'modified_lines': sum(1 for t in edited_translations if t['modified'])
	            },
	            'translations': edited_translations
	        }
	        with open(output_path, 'w', encoding='utf-8') as f:
	            json.dump(payload, f, ensure_ascii=False, indent=2)


	# Example usage:
	if __name__ == "__main__":
	    import os

	    # Check up front so a missing key is reported once and clearly,
	    # instead of being passed to the client as None.
	    api_key = os.getenv('ANTHROPIC_API_KEY')
	    if not api_key:
	        raise SystemExit("ANTHROPIC_API_KEY environment variable is not set")

	    # Load validation results
	    with open('data/validation_results.json', 'r', encoding='utf-8') as f:
	        validation_results = json.load(f)

	    # Create editor and process translations
	    editor = LLMTranslationEditor(validation_results, api_key)
	    edited_translations = editor.post_edit_translations()

	    # Save results
	    editor.save_edits(edited_translations, 'llm_post_edited_translations.json')

	    # Print summary and the modified lines
	    modified_translations = [t for t in edited_translations if t['modified']]
	    print("Post-editing completed:")
	    print(f"Total lines: {len(edited_translations)}")
	    print(f"Modified lines: {len(modified_translations)}")

	    print("\nExample modifications:")
	    for trans in modified_translations:
	        print(f"\nLine {trans['line_number']}:")
	        print(f"Source : {trans['source']}")
	        print(f"Original: {trans['original']}")
	        print(f"Edited : {trans['edited']}")
	        print(f"Reasoning: {trans['reasoning']}")