# Spaces: openpecha/translation_glossary_checker (Running)
# File size: 8,350 Bytes
import json
import re
from pathlib import Path

from anthropic import Anthropic

from glossary_checker import GlossaryChecker


class TranslationValidator:
    """Score aligned Tibetan/English segments against a glossary.

    For every aligned pair the glossary checker reports the glossary terms it
    found, a Claude model is asked how each term was translated, and a
    percentage score is derived from the model's per-term assessment.
    """

    # Defaults used when the caller does not override them in ``__init__``.
    # NOTE(review): confirm this model id is still served by the API; pass
    # ``model=...`` to the constructor otherwise.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 2000

    def __init__(
        self,
        glossary_checker,
        anthropic_api_key,
        model=DEFAULT_MODEL,
        max_tokens=DEFAULT_MAX_TOKENS,
    ):
        """Initialize validator with glossary checker and API key.

        Args:
            glossary_checker: Object exposing ``check(source, target)``; its
                result is passed to ``analyze_terms`` as ``found_terms``.
            anthropic_api_key: Key handed to the ``Anthropic`` client.
            model: Model id used for every analysis request.
            max_tokens: Response token limit used for every analysis request.
        """
        self.checker = glossary_checker
        self.client = Anthropic(api_key=anthropic_api_key)
        self.model = model
        self.max_tokens = max_tokens

    @staticmethod
    def _format_term(term):
        """Return the prompt fragment describing one glossary term."""
        pieces = [f"\n\nTerm: {term['source_term']}"]
        for cat_name, cat_data in term['categories'].items():
            pieces.append(f"\nCategory '{cat_name}':")
            pieces.append(f"\n- Expected translations: {', '.join(cat_data['translations'])}")
            if 'definitions' in cat_data:
                pieces.append(f"\n- Definitions: {', '.join(cat_data['definitions'])}")
        return "".join(pieces)

    def _build_prompt(self, source_text, target_text, found_terms):
        """Assemble the full analysis prompt: header, term details, instructions."""
        header = f"""Analyze each term found in this Tibetan text and its translation:

Tibetan text: {source_text}
English translation: {target_text}

For each term, I'll provide:
- The term
- Expected translations from glossary

Please analyze:"""

        term_details = "".join(self._format_term(term) for term in found_terms)

        # NOTE(review): the key points below start at "2." -- item 1 appears to
        # be missing. Left untouched so the model sees the same prompt as before.
        instructions = """\n
For each term, provide analysis in JSON format:
[{
  "term": "term1",
  "analysis": {
    "translated_as": "how it appears in the target translation",
    "glossary_translation": "how it should be translated according to the glossary",
    "matching_categories": ["category1", "category2"],
    "translation_assessment": {
      "translated_correctly": true/false,
      "should_be_counted": true/false
    }
  }
}]

Key points for analysis:
2. should_be_counted: true if the term's usage matches any of the glossary definitions
3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
4. Consider both the definitions and provided translations when analyzing the term's usage
5. translated_correctly: true if the term matches the glossary definition with these specific conditions:
    5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
        - NOT correct, even if semantically equivalent
        Example:
        - ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
        - རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect

    5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
        - Correct if only differs in:
            * Singular/plural forms (sugata/sugatas)
            * Case variations (buddha/buddha's)
            * Common derived forms (dharma/dharmic)
        Example:
        - བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
        - སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct

    5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""

        return header + term_details + instructions

    @staticmethod
    def _extract_json_array(text):
        """Return the first JSON array of objects embedded in *text*, or None.

        Each ``[`` is tried as the start of a JSON value, so brackets in the
        prose around the payload (for example "[see below]") are skipped
        instead of corrupting the parse.
        """
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                value, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                value = None
            if isinstance(value, list) and all(isinstance(v, dict) for v in value):
                return value
            start = text.find("[", start + 1)
        return None

    def analyze_terms(self, source_text, target_text, found_terms):
        """Analyze terms using Claude to assess their usage and translation in context.

        Args:
            source_text: Tibetan source segment.
            target_text: English translation of the segment.
            found_terms: Iterable of dicts with ``source_term`` and
                ``categories`` (category name -> dict holding ``translations``
                and optionally ``definitions``).

        Returns:
            A list with one dict per found term for which the model returned a
            usable analysis. Each dict holds ``source_term``, ``categories``
            (restricted to the categories the model reported as matching) and
            ``analysis`` (the model's analysis object). Returns ``[]`` when
            there are no terms or no JSON array can be recovered from the
            reply.

        Exceptions raised by the API client itself are not caught here.
        """
        if not found_terms:
            return []

        prompt = self._build_prompt(source_text, target_text, found_terms)
        message = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )

        # Tolerate an empty reply or content blocks that carry no text.
        response_text = "".join(
            getattr(block, "text", None) or "" for block in (message.content or [])
        )

        analysis = self._extract_json_array(response_text)
        if analysis is None:
            print("Error parsing LLM response: no JSON array of objects found")
            return []

        # Index the first well-formed entry per term; a malformed entry is
        # skipped on its own rather than discarding the whole reply.
        analysis_by_term = {}
        for item in analysis:
            term_name = item.get("term")
            details = item.get("analysis")
            if not isinstance(term_name, str) or not isinstance(details, dict):
                print(f"Warning: Skipping malformed analysis entry: {item!r}")
                continue
            analysis_by_term.setdefault(term_name, details)

        analyzed_terms = []
        for term in found_terms:
            details = analysis_by_term.get(term["source_term"])
            if details is None:
                continue

            matching = details.get("matching_categories")
            if not isinstance(matching, list):
                matching = []

            analyzed_terms.append(
                {
                    "source_term": term["source_term"],
                    # Only include matching categories
                    "categories": {
                        cat_name: cat_data
                        for cat_name, cat_data in term["categories"].items()
                        if cat_name in matching
                    },
                    "analysis": details,
                }
            )

        return analyzed_terms

    @staticmethod
    def _assessment_of(term):
        """Return the term's ``translation_assessment`` dict, or ``{}`` if absent."""
        analysis = term.get("analysis") if isinstance(term, dict) else None
        assessment = (
            analysis.get("translation_assessment")
            if isinstance(analysis, dict)
            else None
        )
        return assessment if isinstance(assessment, dict) else {}

    def calculate_translation_score(self, found_terms):
        """Calculate translation score based on correct translations.

        Only terms whose assessment has a truthy ``should_be_counted`` take
        part; the score is the percentage of those with a truthy
        ``translated_correctly``. Terms lacking a usable assessment are
        treated as not countable instead of raising.

        Returns:
            ``0.0`` when *found_terms* is empty, ``100.0`` when nothing is
            countable, otherwise a percentage in ``[0, 100]``.

        NOTE(review): the 0.0-vs-100.0 asymmetry is kept from the original
        behaviour; lines without analysed terms pull the average down.
        """
        if not found_terms:
            return 0.0

        total_countable_terms = 0
        correctly_translated = 0

        for term in found_terms:
            assessment = self._assessment_of(term)
            if not assessment.get("should_be_counted"):
                continue
            total_countable_terms += 1
            if assessment.get("translated_correctly"):
                correctly_translated += 1

        if total_countable_terms == 0:
            return 100.0
        return correctly_translated / total_countable_terms * 100

    def validate_translation(self, aligned_file_path):
        """Process aligned file and validate translations.

        Returns:
            A list with one dict per aligned pair: ``line_number`` (1-based
            index among the valid pairs), ``source``, ``target``, ``terms``
            and ``score``.
        """
        aligned_pairs = self.load_aligned_file(aligned_file_path)

        results = []
        for line_num, (source, target) in enumerate(aligned_pairs, 1):
            # Glossary lookup, then model analysis, then scoring.
            check_results = self.checker.check(source, target)
            analyzed_terms = self.analyze_terms(source, target, check_results)
            score = self.calculate_translation_score(analyzed_terms)

            results.append(
                {
                    "line_number": line_num,
                    "source": source,
                    "target": target,
                    "terms": analyzed_terms,
                    "score": score,
                }
            )

        return results

    def load_aligned_file(self, file_path):
        """Load tab-separated source and target segments.

        Blank lines are ignored; lines that do not consist of exactly two
        tab-separated fields are reported and skipped.

        Returns:
            A list of ``(source, target)`` tuples with surrounding whitespace
            removed.
        """
        aligned_pairs = []
        # "utf-8-sig" also reads plain UTF-8 but drops a leading byte-order
        # mark, which would otherwise stay attached to the first source segment.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split("\t")
                if len(parts) != 2:
                    print(f"Warning: Skipping malformed line: {line}")
                    continue

                source, target = parts
                aligned_pairs.append((source.strip(), target.strip()))

        return aligned_pairs

    def save_results(self, results, output_path):
        """Save validation results to JSON file.

        The file holds a ``summary`` (line count and mean score, ``0`` when
        *results* is empty) followed by the per-line ``lines``.
        """
        average_score = (
            sum(r["score"] for r in results) / len(results) if results else 0
        )
        payload = {
            "summary": {
                "total_lines": len(results),
                "average_score": average_score,
            },
            "lines": results,
        }
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)


# Example usage:
if __name__ == "__main__":
    import os

    # Every example input and output lives in the "data" folder beside this file.
    data_dir = Path(__file__).parent.joinpath("data")
    glossary_file = data_dir / "84000_glossary.json"
    pairs_file = data_dir / "example_translations.txt"
    report_file = data_dir / "validation_results.json"

    # Wire the glossary checker into a validator; the API key comes from the
    # environment.
    glossary = GlossaryChecker(glossary_file)
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    translation_validator = TranslationValidator(glossary, api_key)

    # Validate the aligned pairs and write the report.
    report = translation_validator.validate_translation(pairs_file)
    translation_validator.save_results(report, report_file)

    print("Validation completed. Results saved to 'data/validation_results.json'.")