translation_glossary_checker / trans_validator.py
test
initial commit
2290099
import json
import re
from pathlib import Path
from anthropic import Anthropic
from glossary_checker import GlossaryChecker
class TranslationValidator:
    """Validate aligned Tibetan-English translations against a glossary.

    A glossary checker finds the glossary terms present in each aligned
    segment pair, an Anthropic model assesses how each term was translated,
    and the assessments are folded into a per-line percentage score.
    """

    # Request settings used when the caller does not override them.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 2000

    # Static tail of the analysis prompt: the response schema and the rules
    # the model must apply. Sent verbatim, so the text is kept at column 0.
    _ANALYSIS_INSTRUCTIONS = """\n
For each term, provide analysis in JSON format:
[{
"term": "term1",
"analysis": {
"translated_as": "how it appears in the target translation",
"glossary_translation": "how it should be translated according to the glossary",
"matching_categories": ["category1", "category2"],
"translation_assessment": {
"translated_correctly": true/false,
"should_be_counted": true/false
}
}
}]
Key points for analysis:
2. should_be_counted: true if the term's usage matches any of the glossary definitions
3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
4. Consider both the definitions and provided translations when analyzing the term's usage
5. translated_correctly: true if the term matches the glossary definition with these specific conditions:
5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
- NOT correct, even if semantically equivalent
Example:
- ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
- རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect
5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
- Correct if only differs in:
* Singular/plural forms (sugata/sugatas)
* Case variations (buddha/buddha's)
* Common derived forms (dharma/dharmic)
Example:
- བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
- སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct
5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""

    def __init__(self, glossary_checker, anthropic_api_key,
                 model=DEFAULT_MODEL, max_tokens=DEFAULT_MAX_TOKENS):
        """Initialize validator with glossary checker and API key.

        Args:
            glossary_checker: Object whose ``check(source, target)`` result is
                passed to :meth:`analyze_terms` as ``found_terms``.
            anthropic_api_key: API key handed to the ``Anthropic`` client.
            model: Model identifier sent with every analysis request.
            max_tokens: Response token limit sent with every analysis request.
        """
        self.checker = glossary_checker
        self.client = Anthropic(api_key=anthropic_api_key)
        self.model = model
        self.max_tokens = max_tokens

    def _build_prompt(self, source_text, target_text, found_terms):
        """Assemble the analysis prompt for one segment pair and its terms."""
        parts = [f"""Analyze each term found in this Tibetan text and its translation:
Tibetan text: {source_text}
English translation: {target_text}
For each term, I'll provide:
- The term
- Expected translations from glossary
Please analyze:"""]
        # Add term details to prompt
        for term in found_terms:
            parts.append(f"\n\nTerm: {term['source_term']}")
            for cat_name, cat_data in term['categories'].items():
                parts.append(f"\nCategory '{cat_name}':")
                parts.append(
                    f"\n- Expected translations: {', '.join(cat_data['translations'])}"
                )
                if 'definitions' in cat_data:
                    parts.append(
                        f"\n- Definitions: {', '.join(cat_data['definitions'])}"
                    )
        parts.append(self._ANALYSIS_INSTRUCTIONS)
        return "".join(parts)

    @staticmethod
    def _find_analysis(analysis, source_term):
        """Return the first dict entry in *analysis* for *source_term*, or None.

        Entries that are not dicts, or that lack a "term" key, are skipped so
        that one malformed entry does not discard the well-formed ones.
        """
        for item in analysis:
            if isinstance(item, dict) and item.get("term") == source_term:
                return item
        return None

    def analyze_terms(self, source_text, target_text, found_terms):
        """Analyze terms using Claude to assess their usage and translation in context.

        Args:
            source_text: Source segment inserted into the prompt.
            target_text: Translated segment inserted into the prompt.
            found_terms: List of dicts, each with a "source_term" string and a
                "categories" mapping of category name to a dict holding
                "translations" and optionally "definitions".

        Returns:
            One dict per term for which the model returned an entry, with keys
            "source_term", "categories" (only the categories the model listed
            under "matching_categories") and "analysis". Returns ``[]`` when
            ``found_terms`` is empty, when the reply contains no JSON array,
            or when the reply cannot be parsed into the expected structure.

        Exceptions raised by the API request itself are not caught here.
        """
        if not found_terms:
            return []

        prompt = self._build_prompt(source_text, target_text, found_terms)

        # The request stays outside the ``try`` so that transport or API
        # failures are not reported as response-parsing errors.
        message = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )

        try:
            json_match = re.search(r"\[.*\]", message.content[0].text, re.DOTALL)
            if not json_match:
                return []
            analysis = json.loads(json_match.group())

            # Add analysis to each term
            analyzed_terms = []
            for term in found_terms:
                item = self._find_analysis(analysis, term["source_term"])
                if item is None:
                    continue
                analyzed_terms.append({
                    "source_term": term["source_term"],
                    # Only include matching categories
                    "categories": {
                        cat_name: cat_data
                        for cat_name, cat_data in term["categories"].items()
                        if cat_name in item["analysis"]["matching_categories"]
                    },
                    "analysis": item["analysis"],
                })
            return analyzed_terms
        except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e:
            # TypeError/IndexError cover structurally odd replies (an empty
            # content list, a non-dict "analysis", ...) that would otherwise
            # abort the whole validation run.
            print(f"Error parsing LLM response: {e}")
            return []

    def calculate_translation_score(self, found_terms):
        """Calculate translation score based on correct translations.

        Args:
            found_terms: Analyzed terms as returned by :meth:`analyze_terms`;
                each must carry ``["analysis"]["translation_assessment"]``
                with "should_be_counted" and "translated_correctly".

        Returns:
            The percentage of countable terms that were translated correctly.
            An empty ``found_terms`` scores 0.0, while a non-empty list with
            no countable term scores 100.0.
        """
        if not found_terms:
            return 0.0

        # Only count terms that should be counted and match glossary
        countable = [
            term["analysis"]["translation_assessment"]
            for term in found_terms
            if term["analysis"]["translation_assessment"]["should_be_counted"]
        ]
        if not countable:
            return 100.0

        correctly_translated = sum(
            1 for assessment in countable if assessment["translated_correctly"]
        )
        return correctly_translated / len(countable) * 100

    def validate_translation(self, aligned_file_path):
        """Process aligned file and validate translations.

        Args:
            aligned_file_path: Path of a tab-separated file readable by
                :meth:`load_aligned_file`.

        Returns:
            One dict per aligned pair with keys "line_number" (1-based index
            among the loaded pairs, not the raw file line), "source",
            "target", "terms" and "score".
        """
        results = []
        aligned_pairs = self.load_aligned_file(aligned_file_path)
        for line_num, (source, target) in enumerate(aligned_pairs, 1):
            # Glossary lookup, then model analysis, then scoring.
            check_results = self.checker.check(source, target)
            analyzed_terms = self.analyze_terms(source, target, check_results)
            results.append({
                "line_number": line_num,
                "source": source,
                "target": target,
                "terms": analyzed_terms,
                "score": self.calculate_translation_score(analyzed_terms),
            })
        return results

    def load_aligned_file(self, file_path):
        """Load tab-separated source and target segments.

        Blank lines are ignored. Lines that do not split into exactly two
        tab-separated fields are skipped with a printed warning.

        Returns:
            A list of ``(source, target)`` tuples with surrounding whitespace
            removed from both fields.
        """
        aligned_pairs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split("\t")
                if len(parts) != 2:
                    print(f"Warning: Skipping malformed line: {line}")
                    continue
                source, target = parts
                aligned_pairs.append((source.strip(), target.strip()))
        return aligned_pairs

    def save_results(self, results, output_path):
        """Save validation results to JSON file.

        The file holds a "summary" (line count and mean score, 0 when there
        are no results) and the per-line ``results`` under "lines".
        """
        average_score = (
            sum(r["score"] for r in results) / len(results) if results else 0
        )
        payload = {
            "summary": {
                "total_lines": len(results),
                "average_score": average_score,
            },
            "lines": results,
        }
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
# Example usage: validate the sample aligned file and write a JSON report.
if __name__ == "__main__":
    import os

    # All inputs and outputs live in the "data" directory next to this file.
    data_dir = Path(__file__).parent / "data"

    # Build the validator from the glossary and the API key in the environment.
    validator = TranslationValidator(
        GlossaryChecker(data_dir / "84000_glossary.json"),
        os.getenv("ANTHROPIC_API_KEY"),
    )

    # Validate every aligned pair, then persist the per-line results.
    report = validator.validate_translation(data_dir / "example_translations.txt")
    validator.save_results(report, data_dir / "validation_results.json")
    print("Validation completed. Results saved to 'data/validation_results.json'.")