|
import json |
|
import re |
|
from pathlib import Path |
|
|
|
from anthropic import Anthropic |
|
|
|
from glossary_checker import GlossaryChecker |
|
|
|
|
|
class TranslationValidator:
    """Validate aligned Tibetan→English translations against a glossary.

    Uses a ``GlossaryChecker`` to locate glossary terms in each source
    segment, then asks Claude to judge — per term — whether the target
    translation uses the exact glossary wording (allowing only basic
    grammatical variation).
    """

    def __init__(self, glossary_checker, anthropic_api_key):
        """Initialize validator with glossary checker and API key.

        Args:
            glossary_checker: object exposing ``check(source, target)`` that
                returns the glossary terms found in ``source``.
            anthropic_api_key: API key passed to the Anthropic client.
        """
        self.checker = glossary_checker
        self.client = Anthropic(api_key=anthropic_api_key)

    def _build_analysis_prompt(self, source_text, target_text, found_terms):
        """Assemble the per-term analysis prompt sent to the LLM."""
        prompt = f"""Analyze each term found in this Tibetan text and its translation:

Tibetan text: {source_text}
English translation: {target_text}

For each term, I'll provide:
- The term
- Expected translations from glossary

Please analyze:"""

        # One section per found term, listing each glossary category's
        # expected translations (and definitions, when present).
        for term in found_terms:
            prompt += f"\n\nTerm: {term['source_term']}"
            for cat_name, cat_data in term["categories"].items():
                prompt += f"\nCategory '{cat_name}':"
                prompt += f"\n- Expected translations: {', '.join(cat_data['translations'])}"
                if "definitions" in cat_data:
                    prompt += f"\n- Definitions: {', '.join(cat_data['definitions'])}"

        # FIX: the "Key points" list previously started at 2 — item 1,
        # describing the required "translated_as" field, was missing.
        prompt += """\n
For each term, provide analysis in JSON format:
[{
    "term": "term1",
    "analysis": {
        "translated_as": "how it appears in the target translation",
        "glossary_translation": "how it should be translated according to the glossary",
        "matching_categories": ["category1", "category2"],
        "translation_assessment": {
            "translated_correctly": true/false,
            "should_be_counted": true/false
        }
    }
}]

Key points for analysis:
1. translated_as: report exactly how the term appears in the target translation
2. should_be_counted: true if the term's usage matches any of the glossary definitions
3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
4. Consider both the definitions and provided translations when analyzing the term's usage
5. translated_correctly: true if the term matches the glossary definition with these specific conditions:
    5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
        - NOT correct, even if semantically equivalent
        Example:
        - ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
        - རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect

    5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
        - Correct if only differs in:
            * Singular/plural forms (sugata/sugatas)
            * Case variations (buddha/buddha's)
            * Common derived forms (dharma/dharmic)
        Example:
        - བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
        - སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct

    5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""

        return prompt

    def analyze_terms(self, source_text, target_text, found_terms):
        """Analyze terms using Claude to assess their usage and translation.

        Args:
            source_text: the Tibetan source segment.
            target_text: the English target segment.
            found_terms: glossary hits from ``self.checker.check``; each has
                ``source_term`` and a ``categories`` mapping.

        Returns:
            A list of dicts with ``source_term``, the categories the model
            judged applicable, and the model's ``analysis``. Empty on no
            terms, on an unparseable model reply, or on malformed JSON.

        Note: API/network errors from ``messages.create`` propagate to the
        caller; only JSON/shape errors are swallowed here.
        """
        if not found_terms:
            return []

        prompt = self._build_analysis_prompt(source_text, target_text, found_terms)

        try:
            message = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}],
            )

            # The model may wrap its JSON in prose; grab the outermost
            # [...] span (DOTALL so the array can span multiple lines).
            json_match = re.search(r"\[.*\]", message.content[0].text, re.DOTALL)
            if not json_match:
                return []

            analysis = json.loads(json_match.group())

            # Index the model's reply by term for O(1) matching instead of
            # re-scanning the list for every found term. First occurrence
            # wins, matching the original first-match-then-break behavior.
            by_term = {}
            for item in analysis:
                by_term.setdefault(item["term"], item)

            analyzed_terms = []
            for term in found_terms:
                item = by_term.get(term["source_term"])
                if item is None:
                    continue  # model did not report on this term

                analyzed_terms.append({
                    "source_term": term["source_term"],
                    # Keep only the categories the model judged applicable.
                    "categories": {
                        cat_name: cat_data
                        for cat_name, cat_data in term["categories"].items()
                        if cat_name in item["analysis"]["matching_categories"]
                    },
                    "analysis": item["analysis"],
                })

            return analyzed_terms

        except (json.JSONDecodeError, KeyError) as e:
            print(f"Error parsing LLM response: {e}")
            return []

    def calculate_translation_score(self, found_terms):
        """Return the percentage of countable terms translated correctly.

        A term is "countable" when the model judged its usage to match a
        glossary definition (``should_be_counted``).

        Returns:
            0.0 for an empty term list; 100.0 when terms exist but none are
            countable; otherwise correct/countable * 100.
        """
        if not found_terms:
            # NOTE(review): an empty list scores 0.0 while "terms found but
            # none countable" scores 100.0 below — confirm this asymmetry
            # is intended before changing it.
            return 0.0

        total_countable_terms = 0
        correctly_translated = 0

        for term in found_terms:
            assessment = term["analysis"]["translation_assessment"]
            if assessment["should_be_counted"]:
                total_countable_terms += 1
                if assessment["translated_correctly"]:
                    correctly_translated += 1

        return (correctly_translated / total_countable_terms * 100) if total_countable_terms > 0 else 100.0

    def validate_translation(self, aligned_file_path):
        """Process an aligned file and validate each segment pair.

        Args:
            aligned_file_path: path to a tab-separated source/target file.

        Returns:
            A list of per-line dicts: line_number (1-based), source, target,
            analyzed terms, and the line's translation score.
        """
        aligned_pairs = self.load_aligned_file(aligned_file_path)

        results = []
        for line_num, (source, target) in enumerate(aligned_pairs, 1):
            check_results = self.checker.check(source, target)
            analyzed_terms = self.analyze_terms(source, target, check_results)
            score = self.calculate_translation_score(analyzed_terms)

            results.append({
                "line_number": line_num,
                "source": source,
                "target": target,
                "terms": analyzed_terms,
                "score": score,
            })

        return results

    def load_aligned_file(self, file_path):
        """Load tab-separated source and target segments.

        Blank lines are skipped; lines without exactly one source and one
        target field are skipped with a warning.

        Returns:
            A list of ``(source, target)`` tuples, each side stripped.
        """
        aligned_pairs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split("\t")
                if len(parts) != 2:
                    print(f"Warning: Skipping malformed line: {line}")
                    continue

                source, target = parts
                aligned_pairs.append((source.strip(), target.strip()))

        return aligned_pairs

    def save_results(self, results, output_path):
        """Save validation results (summary + per-line details) to JSON.

        The summary holds the line count and the mean of the per-line
        scores (0 when there are no results).
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "summary": {
                        "total_lines": len(results),
                        "average_score": (
                            sum(r["score"] for r in results) / len(results)
                            if results
                            else 0
                        ),
                    },
                    "lines": results,
                },
                f,
                ensure_ascii=False,  # keep Tibetan text readable in the output
                indent=2,
            )
|
|
|
|
|
|
|
if __name__ == "__main__":
    import os

    # All inputs/outputs live under ./data next to this script.
    data_path = Path(__file__).parent / "data"

    # Wire the glossary checker into the validator; the API key comes
    # from the environment.
    validator = TranslationValidator(
        GlossaryChecker(data_path / "84000_glossary.json"),
        os.getenv("ANTHROPIC_API_KEY"),
    )

    # Validate the example aligned file and persist the report.
    validation_results = validator.validate_translation(
        data_path / "example_translations.txt"
    )
    validator.save_results(validation_results, data_path / "validation_results.json")

    print("Validation completed. Results saved to 'data/validation_results.json'.")
|
|
|
|