# Spaces: openpecha/translation_glossary_checker (Running)
# File size: 7,802 Bytes
# 2ba7d76

import json
from typing import Any, Dict, List

from anthropic import Anthropic


class LLMTranslationEditor:
    """Post-edit translations with Claude, guided by glossary validation results.

    The editor walks the ``lines`` of a validation-results dict, asks the
    model to rewrite every line that contains a counted but incorrectly
    translated glossary term, and collects the outcome per line.
    """

    # Defaults applied when the caller does not override them in __init__.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 1000

    def __init__(self, validation_results: dict, anthropic_api_key: str,
                 model: str = DEFAULT_MODEL,
                 max_tokens: int = DEFAULT_MAX_TOKENS):
        """Initialize with validation results and Anthropic API key.

        Args:
            validation_results (dict): Results from TranslationValidator
            anthropic_api_key (str): Anthropic API key for Claude access
            model (str): Model identifier sent with every request
            max_tokens (int): Upper bound on tokens in each model reply
        """
        self.results = validation_results
        self.client = Anthropic(api_key=anthropic_api_key)
        self.model = model
        self.max_tokens = max_tokens

    @staticmethod
    def _unchanged(current_translation: str, reasoning: str) -> Dict[str, Any]:
        """Build the result dict for a translation that is left as it is."""
        return {
            'edited_translation': current_translation,
            'modified': False,
            'reasoning': reasoning
        }

    @staticmethod
    def _build_terms_context(terms_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Select the terms that are counted but not translated correctly.

        Raises:
            KeyError: If a term lacks one of the fields read here.
        """
        terms_context = []
        for term in terms_info:
            analysis = term['analysis']
            assessment = analysis['translation_assessment']
            if not assessment['should_be_counted'] or assessment['translated_correctly']:
                continue

            term_context = {
                'term': term['source_term'],
                'current': analysis['translated_as'],
                'suggested': analysis['glossary_translation'],
                'categories': {}
            }
            # Only categories the analysis marked as matching are passed on.
            for cat_name, cat_data in term['categories'].items():
                if cat_name in analysis['matching_categories']:
                    term_context['categories'][cat_name] = {
                        'translations': cat_data.get('translations', []),
                        'definitions': cat_data.get('definitions', [])
                    }
            terms_context.append(term_context)
        return terms_context

    @staticmethod
    def _build_prompt(source_text: str, current_translation: str,
                      terms_context: List[Dict[str, Any]]) -> str:
        """Assemble the editing prompt for one line and its flagged terms."""
        parts = [f"""You are an expert Tibetan translator. Review and improve this translation, focusing on accuracy and natural English:

Tibetan text: {source_text}
Current translation: {current_translation}

The following terms need attention:"""]

        for term in terms_context:
            parts.append(f"\n\nTibetan term: {term['term']}")
            parts.append(f"\nCurrently translated as: {term['current']}")
            parts.append(f"\nGlossary suggestion: {term['suggested']}")

            for cat_name, cat_data in term['categories'].items():
                parts.append(f"\n{cat_name}:")
                # map(str, ...) keeps a non-string glossary entry from
                # raising TypeError while the prompt is being built.
                if cat_data['definitions']:
                    parts.append(f"\n- Definitions: {', '.join(map(str, cat_data['definitions']))}")
                if cat_data['translations']:
                    parts.append(f"\n- Translations: {', '.join(map(str, cat_data['translations']))}")

        parts.append("""

Please provide:
1. An improved translation that:
   - Maintains the meaning of the Tibetan text
   - Maintains the style and tone of the current translation
   - Uses appropriate technical terms from the glossary
   - Preserves any correct parts of the current translation
2. Your reasoning for the changes

Respond in JSON format:
{
  "edited_translation": "your improved translation",
  "reasoning": "explanation of changes and decisions",
  "modified": true/false
}""")
        return ''.join(parts)

    @classmethod
    def _parse_response(cls, response_text: str,
                        current_translation: str) -> Dict[str, Any]:
        """Extract and normalize the JSON object from the model's reply.

        The reply is scanned for the first JSON object that carries a string
        ``edited_translation``. The returned dict always has the keys
        ``edited_translation``, ``modified`` and ``reasoning``; when no usable
        object is found the current translation is returned unmodified.
        """
        decoder = json.JSONDecoder()
        start = response_text.find('{')
        while start != -1:
            try:
                # raw_decode stops at the end of the object, so braces in any
                # text that follows the JSON do not break parsing.
                payload, _ = decoder.raw_decode(response_text, start)
            except ValueError:
                payload = None
            if isinstance(payload, dict) and isinstance(payload.get('edited_translation'), str):
                break
            start = response_text.find('{', start + 1)
        else:
            return cls._unchanged(current_translation, 'Failed to parse LLM response')

        edited = payload['edited_translation']
        modified = payload.get('modified')
        if not isinstance(modified, bool):
            # The flag is missing or malformed: derive it from the text.
            modified = edited != current_translation
        return {
            **payload,
            'edited_translation': edited,
            'reasoning': payload.get('reasoning', ''),
            'modified': modified
        }

    def edit_translation(self, source_text: str, current_translation: str,
                        terms_info: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Use Claude to edit the translation considering validation results and context.

        Args:
            source_text (str): Original Tibetan text
            current_translation (str): Current English translation
            terms_info (list): Terms information from validation results

        Returns:
            Dict[str, Any]: Edited translation with analysis; always contains
                the keys 'edited_translation', 'modified' and 'reasoning'

        Raises:
            KeyError: If an entry of terms_info lacks an expected field.
        """
        terms_context = self._build_terms_context(terms_info)
        if not terms_context:
            return self._unchanged(current_translation, 'No terms requiring editing')

        prompt = self._build_prompt(source_text, current_translation, terms_context)

        # Best effort: any failure while calling the API or reading the reply
        # is reported in the result rather than aborting the caller's loop.
        try:
            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            return self._parse_response(message.content[0].text, current_translation)
        except Exception as e:
            print(f"Error during LLM editing: {e}")
            return self._unchanged(current_translation, f'LLM editing failed: {str(e)}')

    def post_edit_translations(self) -> List[Dict[str, Any]]:
        """Process all lines and post-edit translations using LLM.

        Returns:
            List[Dict[str, Any]]: List of edited translations with analysis
        """
        edited_translations = []

        for line in self.results['lines']:
            source = line['source']
            target = line['target']
            terms = line['terms']

            if terms:
                edit_result = self.edit_translation(source, target, terms)
            else:
                # Nothing to check, so no request is made for this line.
                edit_result = self._unchanged(target, 'No terms to edit')

            edited_translations.append({
                'line_number': line['line_number'],
                'source': source,
                'original': target,
                'edited': edit_result['edited_translation'],
                'modified': edit_result['modified'],
                'reasoning': edit_result['reasoning']
            })

        return edited_translations

    def save_edits(self, edited_translations: List[Dict[str, Any]],
                  output_path: str) -> None:
        """Save the post-edited translations with analysis to a file.

        Args:
            edited_translations (List[Dict[str, Any]]): Edited translations with analysis
            output_path (str): Path to save results
        """
        report = {
            'summary': {
                'total_lines': len(edited_translations),
                'modified_lines': sum(1 for t in edited_translations if t['modified'])
            },
            'translations': edited_translations
        }
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)


# Example usage: post-edit the lines in data/validation_results.json and
# write the outcome to llm_post_edited_translations.json.
if __name__ == "__main__":
    import os

    # Fail fast with a clear message instead of handing None to the client.
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise SystemExit("ANTHROPIC_API_KEY environment variable is not set")

    # Load validation results
    with open('data/validation_results.json', 'r', encoding='utf-8') as f:
        validation_results = json.load(f)

    # Create editor and process translations
    editor = LLMTranslationEditor(validation_results, api_key)
    edited_translations = editor.post_edit_translations()

    # Save results
    editor.save_edits(edited_translations, 'llm_post_edited_translations.json')

    # Filter once; the same selection feeds the count and the listing below.
    modified_translations = [t for t in edited_translations if t['modified']]

    # Print summary and examples
    print("Post-editing completed:")
    print(f"Total lines: {len(edited_translations)}")
    print(f"Modified lines: {len(modified_translations)}")

    print("\nExample modifications:")
    for trans in modified_translations:
        print(f"\nLine {trans['line_number']}:")
        print(f"Source  : {trans['source']}")
        print(f"Original: {trans['original']}")
        print(f"Edited  : {trans['edited']}")
        print(f"Reasoning: {trans['reasoning']}")