Spaces:

openpecha
/

translation_glossary_checker

Sleeping

translation_glossary_checker / glossary_checker.py

test

initial commit

2290099 23 days ago

7.86 kB

	import json
	from pathlib import Path
	from xml.dom import minidom
	from xml.etree.ElementTree import Element, SubElement, tostring


	class GlossaryChecker:
	    """Check a Tibetan source text and its translation against a glossary.

	    The glossary is read as a JSON object mapping each Tibetan term to a
	    mapping of semantic category -> ``{"translations": [...],
	    "definitions": [...]}`` (shape inferred from how
	    ``_build_term_mappings`` reads it).
	    """

	    def __init__(self, glossary_path):
	        """Load the glossary JSON at *glossary_path* and index its terms."""
	        self.glossary = self._load_glossary(glossary_path)
	        self._build_term_mappings()

	    def _load_glossary(self, path):
	        """Return the parsed JSON content of the UTF-8 file at *path*."""
	        with open(path, "r", encoding="utf-8") as f:
	            return json.load(f)

	    def _normalize_tibetan_term(self, text):
	        """Normalize a glossary term for matching.

	        Removes every shad, surrounding whitespace and any trailing tsheg,
	        so that the result has the same form as the candidates that
	        ``extract_terms`` builds by joining syllables with a tsheg.
	        """
	        # Strip whitespace *before* the tsheg so "term་ " is handled too.
	        text = text.replace("།", "").strip()
	        return text.rstrip("་")

	    def get_tibetan_syllables(self, text):
	        """Split Tibetan text into syllables.

	        Shad marks are removed, the text is split on whitespace and each
	        chunk is split on the tsheg.  Empty pieces, which a trailing or
	        doubled tsheg would otherwise produce, are dropped.

	        Returns:
	            list[str]: the non-empty syllables in reading order.
	        """
	        text = text.replace("།", "")
	        return [
	            syllable
	            for chunk in text.split()
	            for syllable in chunk.split("་")
	            if syllable
	        ]

	    @staticmethod
	    def _merge_unique(target, items):
	        """Append to *target* each element of *items* not already in it."""
	        for item in items:
	            if item not in target:
	                target.append(item)

	    def _build_term_mappings(self):
	        """Build mappings for terms, including their semantic categories and definitions.

	        Populates:
	            self.terms: set of normalized terms used for matching.
	            self.term_info: normalized term -> ``{"original_term": str,
	                "categories": {category: {"translations", "definitions"}}}``.
	            self._max_term_syllables: syllable count of the longest term,
	                used by ``extract_terms`` to bound its search window.

	        Glossary keys that normalize to the same term are merged instead of
	        the later one replacing the earlier one; the first key seen is kept
	        as ``original_term``.
	        """
	        self.term_info = {}  # Store complete term information
	        self.terms = set()  # Normalized terms for matching
	        self._max_term_syllables = 0

	        for term, data in self.glossary.items():
	            normalized_term = self._normalize_tibetan_term(term)
	            if not normalized_term:
	                # A key made only of punctuation/whitespace can never match.
	                continue

	            self.terms.add(normalized_term)
	            self._max_term_syllables = max(
	                self._max_term_syllables, normalized_term.count("་") + 1
	            )

	            entry = self.term_info.setdefault(
	                normalized_term, {"original_term": term, "categories": {}}
	            )
	            if not isinstance(data, dict):
	                continue

	            # Store data by semantic category
	            for category, cat_data in data.items():
	                if not isinstance(cat_data, dict):
	                    continue
	                # ``or []`` also covers an explicit JSON null.
	                translations = cat_data.get("translations") or []
	                definitions = cat_data.get("definitions") or []

	                existing = entry["categories"].get(category)
	                if existing is None:
	                    entry["categories"][category] = {
	                        "translations": list(translations),
	                        "definitions": list(definitions),
	                    }
	                else:
	                    # Same category reached through a colliding key: merge.
	                    self._merge_unique(existing["translations"], translations)
	                    self._merge_unique(existing["definitions"], definitions)

	    def extract_terms(self, text):
	        """Extract terms based on Tibetan syllable matching.

	        Scans the syllables left to right; at each position the longest
	        run of syllables that forms a glossary term is taken and the scan
	        resumes after it.  A term occurring several times is returned
	        several times.

	        Returns:
	            list[str]: matched normalized terms in order of appearance.
	        """
	        syllables = self.get_tibetan_syllables(text)

	        # No candidate longer than the longest glossary term can match, so
	        # the window is capped instead of always running to the end of text.
	        max_len = getattr(self, "_max_term_syllables", None)
	        if max_len is None:
	            # ``terms`` was populated without ``_build_term_mappings``.
	            max_len = max((t.count("་") + 1 for t in self.terms), default=0)

	        found_terms = []
	        total = len(syllables)
	        i = 0
	        while i < total:
	            step = 1
	            for j in range(min(total, i + max_len), i, -1):
	                candidate = "་".join(syllables[i:j])
	                if candidate in self.terms:
	                    found_terms.append(candidate)
	                    step = j - i
	                    break
	            i += step

	        return found_terms

	    def check(self, source_text, translation_text):
	        """Check source text and translation against the glossary with category information.

	        Args:
	            source_text: Tibetan text to search for glossary terms.
	            translation_text: translation searched (case-sensitive substring
	                match) for each term's expected translations.

	        Returns:
	            list[dict]: one entry per term occurrence with the keys
	            ``source_term``, ``normalized_term``, ``categories``,
	            ``found_in_source`` and ``found_in_translation``.
	        """
	        results = []

	        for term in self.extract_terms(source_text):
	            term_data = self.term_info[term]

	            result = {
	                "source_term": term_data["original_term"],
	                "normalized_term": term,
	                "categories": {},
	                "found_in_source": True,
	                "found_in_translation": False,
	            }

	            # Check translations for each semantic category
	            for category, cat_data in term_data["categories"].items():
	                # An empty translation is skipped: "" is a substring of every
	                # text and would otherwise always count as found.
	                found = any(
	                    trans and trans in translation_text
	                    for trans in cat_data["translations"]
	                )
	                result["categories"][category] = {
	                    "translations": cat_data["translations"],
	                    "definitions": cat_data["definitions"],
	                    "translation_found": found,
	                }
	                if found:
	                    result["found_in_translation"] = True

	            results.append(result)

	        return results

	    def results_to_xml(self, results, source_text, translation_text, pretty_print=True):
	        """Convert checker results to XML format.

	        Args:
	            results: List of result dictionaries from check()
	            source_text: Original source text that was checked
	            translation_text: Translation text that was checked
	            pretty_print: Whether to format the XML with proper indentation

	        Returns:
	            str: XML string representation of the results.  The pretty
	            printed form starts with an XML declaration and has no trailing
	            newline; the compact form is the bare root element.
	        """
	        # Create root element
	        root = Element("glossary_check")

	        # Add text information
	        texts = SubElement(root, "texts")
	        source = SubElement(texts, "source")
	        source.text = source_text
	        translation = SubElement(texts, "translation")
	        translation.text = translation_text

	        # Add found terms
	        terms = SubElement(root, "terms")

	        for result in results:
	            term = SubElement(terms, "term")

	            # Add term information
	            source_term = SubElement(term, "source_term")
	            source_term.text = result["source_term"]

	            norm_term = SubElement(term, "normalized_term")
	            norm_term.text = result["normalized_term"]

	            found_status = SubElement(term, "found_status")
	            SubElement(found_status, "in_source").text = str(result["found_in_source"])
	            SubElement(found_status, "in_translation").text = str(
	                result["found_in_translation"]
	            )

	            # Add categories
	            categories = SubElement(term, "categories")
	            for cat_name, cat_data in result["categories"].items():
	                category = SubElement(categories, "category")
	                category.set("type", cat_name)

	                # Add translations
	                translations = SubElement(category, "translations")
	                translations.set("found", str(cat_data["translation_found"]))
	                for trans in cat_data["translations"]:
	                    trans_elem = SubElement(translations, "translation")
	                    trans_elem.text = trans

	                # Add definitions
	                definitions = SubElement(category, "definitions")
	                for defn in cat_data["definitions"]:
	                    defn_elem = SubElement(definitions, "definition")
	                    defn_elem.text = defn

	        compact = tostring(root, encoding="unicode")
	        if not pretty_print:
	            return compact

	        xml_str = minidom.parseString(compact).toprettyxml(indent=" ")
	        # Only the final newline is removed.  Filtering out every blank line
	        # would also delete blank lines inside the embedded source and
	        # translation text.
	        return xml_str.rstrip("\n")


	# Demo run: check one Tibetan verse against its English rendering using the
	# glossary stored next to this file, and print the report as XML.
	if __name__ == "__main__":
	    # The verse, one string literal per line of the stanza.
	    source = (
	        "བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །"
	        "ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །"
	        "བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །"
	        "ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །"
	    )
	    translation = (
	        "I prostrate with respect to the sugatas, "
	        "Who have the dharmakaya, and their offspring, "
	        "And also to all worthy of veneration. "
	        "I'll teach in brief, according to the scriptures, "
	        "The way to enter the bodhisattva's vows."
	    )

	    # The glossary is expected under ./data relative to this module.
	    module_dir = Path(__file__).parent
	    glossary_path = module_dir / "data" / "84000_glossary.json"
	    checker = GlossaryChecker(glossary_path)

	    # Run the check, then serialize the findings (pretty printed by default).
	    results = checker.check(source, translation)
	    xml_output = checker.results_to_xml(results, source, translation)
	    print(xml_output)