Spaces:

openpecha
/

translation_glossary_checker

Running

translation_glossary_checker / trans_validator.py

test

initial commit

2290099 11 days ago

8.35 kB

	import json
	import re
	from pathlib import Path

	from anthropic import Anthropic

	from glossary_checker import GlossaryChecker


	class TranslationValidator:
	    """Score glossary-term usage in aligned Tibetan/English segments.

	    For every aligned pair the glossary checker reports the glossary terms
	    it found; those terms are sent to an Anthropic model, which judges how
	    each one was translated, and a percentage score is derived from the
	    model's verdicts.
	    """

	    # Defaults match the values that used to be hard-coded in the API call.
	    DEFAULT_MODEL = "claude-3-sonnet-20240229"
	    DEFAULT_MAX_TOKENS = 2000

	    # Fixed tail of the prompt: expected JSON shape plus the judging rules.
	    # NOTE(review): the "Key points" list starts at 2 in the original prompt;
	    # it is kept verbatim because changing it could change model output.
	    _ANALYSIS_INSTRUCTIONS = """\n
	For each term, provide analysis in JSON format:
	[{
	"term": "term1",
	"analysis": {
	"translated_as": "how it appears in the target translation",
	"glossary_translation": "how it should be translated according to the glossary",
	"matching_categories": ["category1", "category2"],
	"translation_assessment": {
	"translated_correctly": true/false,
	"should_be_counted": true/false
	}
	}
	}]

	Key points for analysis:
	2. should_be_counted: true if the term's usage matches any of the glossary definitions
	3. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
	4. Consider both the definitions and provided translations when analyzing the term's usage
	5. translated_correctly: true if the term matches the glossary definition with these specific conditions:
	5.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
	- NOT correct, even if semantically equivalent
	Example:
	- ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
	- རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect

	5.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
	- Correct if only differs in:
	* Singular/plural forms (sugata/sugatas)
	* Case variations (buddha/buddha's)
	* Common derived forms (dharma/dharmic)
	Example:
	- བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
	- སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct

	5.3 The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""

	    def __init__(
	        self,
	        glossary_checker,
	        anthropic_api_key,
	        model=DEFAULT_MODEL,
	        max_tokens=DEFAULT_MAX_TOKENS,
	    ):
	        """Initialize validator with glossary checker and API key.

	        Args:
	            glossary_checker: Object whose ``check(source, target)`` returns
	                the glossary terms found in a segment pair.
	            anthropic_api_key: Key passed to the ``Anthropic`` client.
	            model: Model identifier used for the analysis request.
	            max_tokens: ``max_tokens`` value sent with each request.
	        """
	        self.checker = glossary_checker
	        self.client = Anthropic(api_key=anthropic_api_key)
	        self.model = model
	        self.max_tokens = max_tokens

	    def _build_prompt(self, source_text, target_text, found_terms):
	        """Assemble the prompt: segment context, one section per term, rules."""
	        sections = [
	            f"""Analyze each term found in this Tibetan text and its translation:

	Tibetan text: {source_text}
	English translation: {target_text}

	For each term, I'll provide:
	- The term
	- Expected translations from glossary

	Please analyze:"""
	        ]

	        for term in found_terms:
	            sections.append(f"\n\nTerm: {term['source_term']}")
	            for cat_name, cat_data in term["categories"].items():
	                sections.append(f"\nCategory '{cat_name}':")
	                sections.append(
	                    f"\n- Expected translations: {', '.join(cat_data['translations'])}"
	                )
	                if "definitions" in cat_data:
	                    sections.append(
	                        f"\n- Definitions: {', '.join(cat_data['definitions'])}"
	                    )

	        sections.append(self._ANALYSIS_INSTRUCTIONS)
	        return "".join(sections)

	    def analyze_terms(self, source_text, target_text, found_terms):
	        """Analyze terms using Claude to assess their usage and translation in context.

	        Args:
	            source_text: Tibetan source segment.
	            target_text: English translation of that segment.
	            found_terms: Term dicts, each with ``source_term`` and
	                ``categories`` (category name -> dict holding ``translations``
	                and optionally ``definitions``).

	        Returns:
	            A list with one dict per term the model reported on, holding
	            ``source_term``, ``categories`` (only those the model listed under
	            ``matching_categories``) and the model's ``analysis`` dict.
	            Returns ``[]`` when there are no terms or when the response
	            contains no parseable JSON array.

	        Errors raised by the API call itself are not caught here.
	        """
	        if not found_terms:
	            return []

	        prompt = self._build_prompt(source_text, target_text, found_terms)
	        message = self.client.messages.create(
	            model=self.model,
	            max_tokens=self.max_tokens,
	            messages=[{"role": "user", "content": prompt}],
	        )

	        try:
	            response_text = message.content[0].text
	        except (IndexError, AttributeError, TypeError) as e:
	            # Empty content list or a first block without text.
	            print(f"Error parsing LLM response: {e}")
	            return []

	        # The model may wrap the JSON in prose; grab the outermost [...] span.
	        json_match = re.search(r"\[.*\]", response_text, re.DOTALL)
	        if not json_match:
	            return []

	        try:
	            analysis = json.loads(json_match.group())
	        except json.JSONDecodeError as e:
	            print(f"Error parsing LLM response: {e}")
	            return []

	        # Index the model's items by term; the first entry for a term wins.
	        # Malformed items are skipped so one bad entry does not discard the
	        # analysis of every other term in the segment.
	        analysis_by_term = {}
	        for item in analysis:
	            if not isinstance(item, dict):
	                continue
	            term_name = item.get("term")
	            details = item.get("analysis")
	            if isinstance(term_name, str) and isinstance(details, dict):
	                analysis_by_term.setdefault(term_name, details)

	        analyzed_terms = []
	        for term in found_terms:
	            details = analysis_by_term.get(term["source_term"])
	            if details is None:
	                continue

	            matching = details.get("matching_categories")
	            if not isinstance(matching, list):
	                matching = []

	            # Preserve original term data, restricted to matching categories.
	            analyzed_terms.append(
	                {
	                    "source_term": term["source_term"],
	                    "categories": {
	                        cat_name: cat_data
	                        for cat_name, cat_data in term["categories"].items()
	                        if cat_name in matching
	                    },
	                    "analysis": details,
	                }
	            )

	        return analyzed_terms

	    def calculate_translation_score(self, found_terms):
	        """Calculate translation score based on correct translations.

	        Args:
	            found_terms: Analyzed term dicts as returned by ``analyze_terms``.

	        Returns:
	            Percentage (0-100) of countable terms that were translated
	            correctly. An empty ``found_terms`` yields 0.0, while a non-empty
	            list with no countable term yields 100.0. Terms whose analysis
	            lacks a ``translation_assessment`` dict are treated as not
	            countable.
	        """
	        if not found_terms:
	            return 0.0

	        total_countable_terms = 0
	        correctly_translated = 0

	        for term in found_terms:
	            analysis = term.get("analysis")
	            assessment = (
	                analysis.get("translation_assessment")
	                if isinstance(analysis, dict)
	                else None
	            )
	            if not isinstance(assessment, dict):
	                continue

	            # Only count terms that should be counted and match glossary
	            if assessment.get("should_be_counted"):
	                total_countable_terms += 1
	                if assessment.get("translated_correctly"):
	                    correctly_translated += 1

	        if total_countable_terms == 0:
	            return 100.0
	        return correctly_translated / total_countable_terms * 100

	    def validate_translation(self, aligned_file_path):
	        """Process aligned file and validate translations.

	        Args:
	            aligned_file_path: Path to a tab-separated source/target file.

	        Returns:
	            One result dict per loaded pair with ``line_number``, ``source``,
	            ``target``, ``terms`` and ``score``. ``line_number`` counts loaded
	            pairs starting at 1, so lines skipped by ``load_aligned_file``
	            are not reflected in it.
	        """
	        aligned_pairs = self.load_aligned_file(aligned_file_path)

	        results = []
	        for line_num, (source, target) in enumerate(aligned_pairs, 1):
	            # Check against glossary
	            check_results = self.checker.check(source, target)

	            # Analyze terms
	            analyzed_terms = self.analyze_terms(source, target, check_results)

	            # Calculate score
	            score = self.calculate_translation_score(analyzed_terms)

	            results.append(
	                {
	                    "line_number": line_num,
	                    "source": source,
	                    "target": target,
	                    "terms": analyzed_terms,
	                    "score": score,
	                }
	            )

	        return results

	    def load_aligned_file(self, file_path):
	        """Load tab-separated source and target segments.

	        Blank lines are ignored. Lines that do not split into exactly two
	        tab-separated fields are reported with a warning and skipped.

	        Returns:
	            List of ``(source, target)`` tuples with surrounding whitespace
	            stripped.
	        """
	        aligned_pairs = []
	        with open(file_path, "r", encoding="utf-8") as f:
	            for line in f:
	                line = line.strip()
	                if not line:
	                    continue

	                parts = line.split("\t")
	                if len(parts) != 2:
	                    print(f"Warning: Skipping malformed line: {line}")
	                    continue

	                source, target = parts
	                aligned_pairs.append((source.strip(), target.strip()))

	        return aligned_pairs

	    def save_results(self, results, output_path):
	        """Save validation results to JSON file.

	        The file holds a ``summary`` (line count and mean score, 0 when there
	        are no results) followed by the per-line ``lines`` list.
	        """
	        average_score = (
	            sum(r["score"] for r in results) / len(results) if results else 0
	        )
	        with open(output_path, "w", encoding="utf-8") as f:
	            json.dump(
	                {
	                    "summary": {
	                        "total_lines": len(results),
	                        "average_score": average_score,
	                    },
	                    "lines": results,
	                },
	                f,
	                ensure_ascii=False,
	                indent=2,
	            )


	# Example usage: validate the bundled example file against the 84000
	# glossary and write the per-line results next to the input data.
	if __name__ == "__main__":
	    import os

	    data_path = Path(__file__).parent / "data"

	    # Initialize components; the API key is read from the environment.
	    glossary_path = data_path / "84000_glossary.json"
	    checker = GlossaryChecker(glossary_path)
	    validator = TranslationValidator(checker, os.getenv("ANTHROPIC_API_KEY"))

	    # Process aligned file
	    aligned_file = data_path / "example_translations.txt"
	    results = validator.validate_translation(aligned_file)

	    # Save results
	    validator.save_results(results, data_path / "validation_results.json")

	    print("Validation completed. Results saved to 'data/validation_results.json'.")