"""Validate aligned Tibetan-English translations against a glossary.

Each aligned segment is checked with a ``GlossaryChecker``; the glossary terms
it reports are then sent to Claude, which judges how every term was rendered
in the English translation. A per-segment score is derived from that judgment.
"""

import json
import re
from pathlib import Path

from anthropic import Anthropic

from glossary_checker import GlossaryChecker

# Greedy on purpose: grabs everything from the first "[" to the last "]" so a
# JSON array embedded in surrounding prose is captured whole.
_JSON_ARRAY_RE = re.compile(r"\[.*\]", re.DOTALL)

# Fixed tail of the analysis prompt. Plain (non f-) string, so the braces of
# the JSON example are literal. The wording is kept verbatim, including the
# numbered list that starts at "2.".
_ANALYSIS_INSTRUCTIONS = """\n
For each term, provide analysis in JSON format:
[{
    "term": "term1",
    "analysis": {
        "translated_as": "how it appears in the target translation",
        "glossary_translation": "how it should be translated according to the glossary",
        "matching_categories": ["category1", "category2"],
        "translation_assessment": {
            "translated_correctly": true/false,
            "should_be_counted": true/false
        }
    }
}]

Key points for analysis:
2. should_be_counted: true if the term's usage matches any of the glossary definitions
3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
4. Consider both the definitions and provided translations when analyzing the term's usage
5. translated_correctly: true if the term matches the glossary definition with these specific conditions:
    5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
        - NOT correct, even if semantically equivalent
        Example:
        - ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
        - རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect
    5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
        - Correct if only differs in:
            * Singular/plural forms (sugata/sugatas)
            * Case variations (buddha/buddha's)
            * Common derived forms (dharma/dharmic)
        Example:
        - བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
        - སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct
    5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""


class TranslationValidator:
    """Score translations by how faithfully they follow glossary terms."""

    # NOTE(review): confirm this model ID is still served by Anthropic; it is
    # kept as the default only to preserve existing behaviour.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 2000

    def __init__(
        self,
        glossary_checker,
        anthropic_api_key,
        model=DEFAULT_MODEL,
        max_tokens=DEFAULT_MAX_TOKENS,
    ):
        """Initialize validator with glossary checker and API key.

        Args:
            glossary_checker: Object exposing ``check(source, target)``; its
                result is passed to :meth:`analyze_terms` as ``found_terms``.
            anthropic_api_key: API key handed to the ``Anthropic`` client.
            model: Model ID used for term analysis.
            max_tokens: Response token limit for each analysis request.
        """
        self.checker = glossary_checker
        self.client = Anthropic(api_key=anthropic_api_key)
        self.model = model
        self.max_tokens = max_tokens

    @staticmethod
    def _build_prompt(source_text, target_text, found_terms):
        """Assemble the analysis prompt for one segment and its terms."""
        parts = [
            "Analyze each term found in this Tibetan text and its translation:\n"
            "\n"
            f"Tibetan text: {source_text}\n"
            f"English translation: {target_text}\n"
            "\n"
            "For each term, I'll provide:\n"
            "- The term\n"
            "- Expected translations from glossary\n"
            "\n"
            "Please analyze:"
        ]
        for term in found_terms:
            parts.append(f"\n\nTerm: {term['source_term']}")
            for cat_name, cat_data in term["categories"].items():
                parts.append(f"\nCategory '{cat_name}':")
                parts.append(
                    f"\n- Expected translations: {', '.join(cat_data['translations'])}"
                )
                if "definitions" in cat_data:
                    parts.append(
                        f"\n- Definitions: {', '.join(cat_data['definitions'])}"
                    )
        parts.append(_ANALYSIS_INSTRUCTIONS)
        return "".join(parts)

    @staticmethod
    def _is_well_formed(item):
        """Return True if *item* has every field the rest of the class reads."""
        try:
            analysis = item["analysis"]
            assessment = analysis["translation_assessment"]
            return (
                isinstance(item["term"], str)
                and isinstance(analysis["matching_categories"], (list, tuple))
                and isinstance(assessment, dict)
                and "should_be_counted" in assessment
                and "translated_correctly" in assessment
            )
        except (KeyError, TypeError):
            return False

    def _merge_analysis(self, found_terms, analysis):
        """Attach the model's analysis to the glossary terms it refers to."""
        by_term = {}
        for item in analysis:
            if not self._is_well_formed(item):
                # Model output is untrusted: drop the bad item, keep the rest.
                print(f"Warning: Skipping malformed analysis item: {item!r}")
                continue
            # First occurrence wins, as in a first-match linear search.
            by_term.setdefault(item["term"], item)

        analyzed_terms = []
        for term in found_terms:
            item = by_term.get(term["source_term"])
            if item is None:
                continue
            matching = item["analysis"]["matching_categories"]
            analyzed_terms.append(
                {
                    "source_term": term["source_term"],
                    # Keep only the glossary categories the model matched.
                    "categories": {
                        cat_name: cat_data
                        for cat_name, cat_data in term["categories"].items()
                        if cat_name in matching
                    },
                    "analysis": item["analysis"],
                }
            )
        return analyzed_terms

    def analyze_terms(self, source_text, target_text, found_terms):
        """Analyze terms using Claude to assess their usage and translation in context.

        Args:
            source_text: Tibetan source segment.
            target_text: English translation of the segment.
            found_terms: Glossary hits; each entry is read as a dict with a
                ``"source_term"`` and a ``"categories"`` mapping whose values
                hold ``"translations"`` and optionally ``"definitions"``.

        Returns:
            A list of dicts with ``"source_term"``, the matching
            ``"categories"`` and the model's ``"analysis"``. Empty when
            ``found_terms`` is empty, when the response holds no JSON array,
            or when the response cannot be parsed.

        Raises:
            KeyError: If a ``found_terms`` entry lacks the keys listed above.
            Errors raised by the Anthropic client are not caught here.
        """
        if not found_terms:
            return []

        prompt = self._build_prompt(source_text, target_text, found_terms)
        message = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )

        # Only response handling is guarded; request failures propagate.
        try:
            json_match = _JSON_ARRAY_RE.search(message.content[0].text)
            if not json_match:
                return []
            analysis = json.loads(json_match.group())
            return self._merge_analysis(found_terms, analysis)
        except (
            json.JSONDecodeError,
            KeyError,
            TypeError,
            IndexError,
            AttributeError,
        ) as e:
            print(f"Error parsing LLM response: {e}")
            return []

    def calculate_translation_score(self, found_terms):
        """Calculate translation score based on correct translations.

        Args:
            found_terms: Analyzed terms as returned by :meth:`analyze_terms`.

        Returns:
            Percentage (0-100) of countable terms that were translated
            correctly. ``0.0`` when ``found_terms`` is empty, ``100.0`` when
            terms exist but none is flagged ``should_be_counted``.

        Raises:
            KeyError: If a term lacks ``analysis`` / ``translation_assessment``
                or the flags inside it.
        """
        if not found_terms:
            return 0.0

        assessments = [
            term["analysis"]["translation_assessment"] for term in found_terms
        ]
        # Only terms whose usage matches the glossary take part in the score.
        countable = [a for a in assessments if a["should_be_counted"]]
        if not countable:
            return 100.0

        correct = sum(1 for a in countable if a["translated_correctly"])
        return correct / len(countable) * 100

    def validate_translation(self, aligned_file_path):
        """Process aligned file and validate translations.

        Args:
            aligned_file_path: Path of a tab-separated source/target file.

        Returns:
            One dict per loaded pair with ``line_number``, ``source``,
            ``target``, ``terms`` and ``score``. ``line_number`` counts loaded
            pairs from 1; blank or malformed lines skipped by
            :meth:`load_aligned_file` are not counted.
        """
        aligned_pairs = self.load_aligned_file(aligned_file_path)
        results = []

        for line_num, (source, target) in enumerate(aligned_pairs, 1):
            check_results = self.checker.check(source, target)
            analyzed_terms = self.analyze_terms(source, target, check_results)
            results.append(
                {
                    "line_number": line_num,
                    "source": source,
                    "target": target,
                    "terms": analyzed_terms,
                    "score": self.calculate_translation_score(analyzed_terms),
                }
            )

        return results

    def load_aligned_file(self, file_path):
        """Load tab-separated source and target segments.

        Blank lines are ignored. Lines that do not split into exactly two
        tab-separated fields are reported and skipped.

        Args:
            file_path: Path of the UTF-8 encoded aligned file.

        Returns:
            A list of ``(source, target)`` tuples with surrounding whitespace
            stripped.
        """
        aligned_pairs = []

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split("\t")
                if len(parts) != 2:
                    print(f"Warning: Skipping malformed line: {line}")
                    continue

                source, target = parts
                aligned_pairs.append((source.strip(), target.strip()))

        return aligned_pairs

    def save_results(self, results, output_path):
        """Save validation results to JSON file.

        Args:
            results: Per-line results as returned by
                :meth:`validate_translation`.
            output_path: Destination file; written as UTF-8 with non-ASCII
                characters left unescaped.
        """
        average_score = (
            sum(r["score"] for r in results) / len(results) if results else 0
        )
        report = {
            "summary": {
                "total_lines": len(results),
                "average_score": average_score,
            },
            "lines": results,
        }
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2)


def main():
    """Example usage: validate the bundled example file and save a report."""
    import os

    data_path = Path(__file__).parent / "data"

    # Initialize components
    glossary_path = data_path / "84000_glossary.json"
    checker = GlossaryChecker(glossary_path)
    validator = TranslationValidator(checker, os.getenv("ANTHROPIC_API_KEY"))

    # Process aligned file
    aligned_file = data_path / "example_translations.txt"
    results = validator.validate_translation(aligned_file)

    # Save results
    validator.save_results(results, data_path / "validation_results.json")

    print("Validation completed. Results saved to 'data/validation_results.json'.")


if __name__ == "__main__":
    main()