test committed on
Commit 2290099 · 1 parent: c252d09

initial commit

.gitignore ADDED
@@ -0,0 +1,135 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ /data/
+
+ exports
+ models
+ .DS_Store
app.py ADDED
@@ -0,0 +1,132 @@
+ import json
+ import os
+ from pathlib import Path
+
+ import gradio as gr
+
+ from glossary_checker import GlossaryChecker
+ from trans_validator import TranslationValidator
+
+ # Configure glossary paths
+ GLOSSARIES = {
+     "84000 Glossary": "data/84000_glossary.json",
+ }
+
+ def load_and_validate(file_obj, selected_glossary, api_key):
+     if not api_key or not api_key.startswith("sk-"):
+         return "Please provide a valid Anthropic API key (starts with 'sk-')"
+
+     temp_path = "temp_aligned.txt"  # defined before try so the except block can clean up safely
+     try:
+         # Read content from the file
+         content = file_obj.decode('utf-8')
+
+         # Save content to temporary file
+         with open(temp_path, "w", encoding='utf-8') as f:
+             f.write(content)
+
+         # Initialize checker and validator
+         glossary_path = GLOSSARIES[selected_glossary]
+         checker = GlossaryChecker(glossary_path)
+         validator = TranslationValidator(checker, api_key)
+
+         # Run validation
+         results = validator.validate_translation(temp_path)
+
+         # Create result display
+         markdown_output = []
+
+         # Add summary (guard against an empty results list)
+         total_score = sum(r['score'] for r in results) / len(results) if results else 0.0
+         markdown_output.append("# Validation Results\n")
+         markdown_output.append(f"**Overall Score**: {total_score:.2f}%\n")
+         markdown_output.append("*(Score based on terms counted in scoring)*\n\n")
+         markdown_output.append(f"**Total Lines**: {len(results)}\n\n")
+
+         # Add detailed results for each line
+         for result in results:
+             markdown_output.append(f"## Line {result['line_number']}\n")
+             markdown_output.append(f"**Score**: {result['score']:.2f}%\n")
+             markdown_output.append(f"**Source**: {result['source']}\n")
+             markdown_output.append(f"**Target**: {result['target']}\n")
+
+             if result['terms']:
+                 # Separate terms into counted and not counted
+                 counted_terms = []
+                 other_terms = []
+
+                 for term in result['terms']:
+                     if term['analysis']['translation_assessment']['should_be_counted']:
+                         counted_terms.append(term)
+                     else:
+                         other_terms.append(term)
+
+                 # Display counted terms first with clear scoring implications
+                 if counted_terms:
+                     markdown_output.append("\n### 📊 Terms Counted in Scoring\n")
+                     for term in counted_terms:
+                         analysis = term['analysis']
+                         assessment = analysis['translation_assessment']
+
+                         markdown_output.append(f"\n#### `{term['source_term']}` {'✅' if assessment['translated_correctly'] else '❌'}\n")
+                         markdown_output.append(f"- Found Translation: **{analysis['translated_as']}**\n")
+                         markdown_output.append(f"- Expected Translation: **{analysis['glossary_translation']}**\n")
+
+                         # Add matching categories for context
+                         for cat_name in analysis['matching_categories']:
+                             cat_data = term['categories'].get(cat_name, {})
+                             markdown_output.append(f"\n*{cat_name}*:\n")
+                             if 'definitions' in cat_data:
+                                 markdown_output.append(f"- Definition: {', '.join(cat_data['definitions'])}\n")
+
+                 # Display other found terms separately
+                 if other_terms:
+                     markdown_output.append("\n### Other Found Terms (Not Counted)\n")
+                     for term in other_terms:
+                         analysis = term['analysis']
+                         markdown_output.append(f"\n#### `{term['source_term']}`\n")
+                         markdown_output.append(f"- Found Translation: {analysis['translated_as']}\n")
+                         markdown_output.append("- Note: Term not counted due to usage context\n")
+
+                 markdown_output.append("\n---\n")
+             else:
+                 markdown_output.append("\n*No glossary terms found in this line*\n\n---\n")
+
+         # Clean up temp file
+         os.remove(temp_path)
+
+         return "\n".join(markdown_output)
+
+     except Exception as e:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+         return f"Error during validation: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=load_and_validate,
+     inputs=[
+         gr.File(label="Upload aligned translations file (tab-separated)", type="binary"),
+         gr.Dropdown(choices=list(GLOSSARIES.keys()), label="Select Glossary"),
+         gr.Textbox(label="Anthropic API Key", placeholder="sk-...", type="password")
+     ],
+     outputs=gr.Markdown(),
+     title="Translation Validation Tool",
+     description="""Upload a file with tab-separated Tibetan source and English translation pairs.
+     The tool validates translations against the glossary using semantic analysis.
+
+     Scoring System:
+     - 📊 Only terms that match glossary definitions are counted in scoring
+     - ✅ Correct translations must use glossary terms (with allowed grammatical variations)
+     - ❌ Semantic equivalents or synonyms are marked as incorrect
+     - Score = (correct translations) / (total counted terms) × 100
+
+     You'll need an Anthropic API key to use this tool. Get one at https://console.anthropic.com/""",
+     examples=[
+         ["data/example_translations.txt", "84000 Glossary", ""]  # Example with masked API key
+     ],
+     cache_examples=False  # Don't cache examples with API keys
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
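
Note that because the gr.File input is declared with type="binary", load_and_validate receives the raw file bytes rather than a path, which is why the function decodes UTF-8 and writes a temporary file before handing a path to the validator.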
data/84000_glossary.json ADDED
The diff for this file is too large to render. See raw diff
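
Since the glossary diff is too large to render, its shape can be inferred from how GlossaryChecker._build_term_mappings consumes it: each Tibetan headword maps to one or more semantic categories ("term", "person", "place", ...), each carrying expected translations and optional definitions. A minimal sketch of one entry, mirroring the བསོད་ནམས། term that appears in data/validation_results.json below (the headword spelling and values here are illustrative, not copied from the real file):

```python
# Illustrative entry shape for data/84000_glossary.json (not actual glossary data).
glossary_entry = {
    "བསོད་ནམས་": {   # Tibetan headword; a trailing tsheg is stripped during normalization
        "term": {    # semantic category: "term", "person", "place", ...
            "translations": ["merit", "meritorious deeds"],
            "definitions": ["Merit refers to the accumulation of positive karma ..."],
        }
    }
}
```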
 
data/example_translations.txt ADDED
@@ -0,0 +1,3 @@
+ བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །	I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.
+ བྱང་ཆུབ་སྨོན་པའི་སེམས་ལས་ནི། །འཁོར་ཚེ་འབྲས་བུ་ཆེ་འབྱུང་ཡང་། །ཇི་ལྟར་འཇུག་པའི་སེམས་བཞིན་དུ། །བསོད་ནམས་རྒྱུན་ཆགས་འབྱུང་བ་མིན། །	Even in samsara, great results come from aspiring bodhichitta. However, unlike engaged bodhichitta, the merit is not continuous.
+ ཐོག་མ་མེད་ལྡན་འཁོར་བ་ནས། །ཚེ་རབས་འདི་འམ་གཞན་དག་ཏུ། །བདག་གིས་མ་འཚལ་སྡིག་བགྱིས་པའམ། །བགྱིད་དུ་སྩལ་བ་ཉིད་དང་ནི། །	In this and other lives, throughout beginningless samsara, I have unknowingly committed misdeeds, or caused others to commit them.
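
Each line pairs a Tibetan source segment with its English translation, separated by a single tab; load_aligned_file in trans_validator.py skips blank lines and warns on any line that does not split into exactly two fields.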
data/validation_results.json ADDED
@@ -0,0 +1,316 @@
+ {
+   "summary": {
+     "total_lines": 1,
+     "average_score": 50.0
+   },
+   "lines": [
+     {
+       "line_number": 1,
+       "source": "བྱང་ཆུབ་སྨོན་པའི་སེམས་ལས་ནི། །འཁོར་ཚེ་འབྲས་བུ་ཆེ་འབྱུང་ཡང་། །ཇི་ལྟར་འཇུག་པའི་སེམས་བཞིན་དུ། །བསོད་ནམས་རྒྱུན་ཆགས་འབྱུང་བ་མིན། །",
+       "target": "Even in samsara, great results come from aspiring bodhichitta. However, unlike engaged bodhichitta, the merit is not continuous.",
+       "terms": [
+         {
+           "source_term": "བྱང་ཆུབ།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "awakening",
+                 "bodhi",
+                 "enlightenment"
+               ],
+               "definitions": [
+                 "Awakening (bodhi) refers to the state of profound realization and understanding achieved by a buddha regarding the true nature of reality. It is the direct, nondual perception of phenomena as they actually are, transcending conceptual thought and ignorance. This term encompasses meanings of knowledge, waking, and blossoming, and is central to Buddhist philosophy and practice. The Tibetan equivalent translates as \"purified and accomplished."
+               ],
+               "translation_found": true
+             }
+           },
+           "analysis": {
+             "translated_as": "awakening",
+             "glossary_translation": "awakening, bodhi, enlightenment",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": true,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "སེམས།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "bodhicitta",
+                 "mind",
+                 "state of mind",
+                 "thought"
+               ],
+               "definitions": [
+                 "One of the four bases of magical power."
+               ],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "aspiring bodhichitta",
+             "glossary_translation": "bodhicitta, mind, state of mind, thought",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "ལས།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "act",
+                 "action",
+                 "activity",
+                 "deed",
+                 "formal act",
+                 "karma",
+                 "karmic action",
+                 "past action",
+                 "potential of their past actions",
+                 "rite"
+               ],
+               "definitions": [
+                 "Karma (or action) refers to any volitional act of body, speech, or mind in Buddhist philosophy. It encompasses both the action itself and its consequences, representing the universal law of moral causation. Karma is the cumulative force of past actions that determines present experiences and future existences. Positive or negative karmic accumulations will produce results unless purified. In some contexts, karma can also refer to ritual activities or formal acts within Buddhist communities. The term may be translated variously as \"action,\" \"deed,\" \"rite,\" or left untranslated depending on the specific context and connotation intended."
+               ],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "(not directly translated)",
+             "glossary_translation": "act, action, activity, deed, formal act, karma, karmic action, past action, potential of their past actions, rite",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": false
+             }
+           }
+         },
+         {
+           "source_term": "འཁོར།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "assembly"
+               ],
+               "definitions": [],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "samsara",
+             "glossary_translation": "assembly",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "འབྲས་བུ་ཆེ།",
+           "categories": {
+             "person": {
+               "translations": [
+                 "Bṛhatphala"
+               ],
+               "definitions": [
+                 "A divine king in the Heaven of Great Fruition."
+               ],
+               "translation_found": false
+             },
+             "place": {
+               "translations": [
+                 "Bṛhatphala",
+                 "Great Fruition Heaven",
+                 "Great Result",
+                 "Heaven of Great Fruition",
+                 "Large Fruit"
+               ],
+               "definitions": [
+                 "Bṛhatphala (meaning \"Great Fruition\" or \"Those in the Great Result\") is one of the heavens in Buddhist cosmology, located in the form realm. It is the twelfth of seventeen heavens in this realm and the third of three levels corresponding to the fourth dhyāna (meditative concentration). The gods inhabiting this heaven are also called Bṛhatphala. In the Sarvāstivāda tradition, it is considered the highest of these three paradises. This heaven is part of the structure of the form realm, which is organized according to the four concentrations and the pure abodes (Śuddhāvāsa)."
+               ],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "great results",
+             "glossary_translation": "Bṛhatphala, Great Fruition Heaven, Great Result, Heaven of Great Fruition, Large Fruit",
+             "matching_categories": [
+               "place",
+               "person"
+             ],
+             "translation_assessment": {
+               "translated_correctly": true,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "འབྱུང་།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "come forth"
+               ],
+               "definitions": [],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "come",
+             "glossary_translation": "come forth",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": true,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "སེམས།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "bodhicitta",
+                 "mind",
+                 "state of mind",
+                 "thought"
+               ],
+               "definitions": [
+                 "One of the four bases of magical power."
+               ],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "aspiring bodhichitta",
+             "glossary_translation": "bodhicitta, mind, state of mind, thought",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "བསོད་ནམས།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "merit",
+                 "meritorious deeds"
+               ],
+               "definitions": [
+                 "Merit refers to the accumulation of positive karma or wholesome tendencies imprinted in the mind through virtuous thoughts, words, and actions. This spiritual momentum ripens into positive results, including happiness, well-being, and progress on the path to freedom from suffering. In Buddhism, merit is considered a highly prized possession, more valuable than physical attributes or skills. According to Mahāyāna teachings, it is important to dedicate one's merit to the benefit of all sentient beings, ensuring that others also experience the positive outcomes generated."
+               ],
+               "translation_found": true
+             }
+           },
+           "analysis": {
+             "translated_as": "merit",
+             "glossary_translation": "merit, meritorious deeds",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": true,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "རྒྱུན།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "continuum"
+               ],
+               "definitions": [
+                 "In the present text this refers to the mental continuum."
+               ],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "continuous",
+             "glossary_translation": "continuum",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": true
+             }
+           }
+         },
+         {
+           "source_term": "ཆགས།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "attached to"
+               ],
+               "definitions": [],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "(not directly translated)",
+             "glossary_translation": "attached to",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": false
+             }
+           }
+         },
+         {
+           "source_term": "འབྱུང་བ།",
+           "categories": {
+             "term": {
+               "translations": [
+                 "element",
+                 "escape"
+               ],
+               "definitions": [],
+               "translation_found": false
+             }
+           },
+           "analysis": {
+             "translated_as": "(not directly translated)",
+             "glossary_translation": "element, escape",
+             "matching_categories": [
+               "term"
+             ],
+             "translation_assessment": {
+               "translated_correctly": false,
+               "should_be_counted": false
+             }
+           }
+         }
+       ],
+       "score": 50.0
+     }
+   ]
+ }
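
This file is sample output from TranslationValidator.save_results for a single validated line: of the eleven extracted terms, eight have should_be_counted set to true, and four of those are marked translated_correctly, which yields the 50.0 score (4/8 × 100).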
glossary_checker.py ADDED
@@ -0,0 +1,194 @@
+ import json
+ from pathlib import Path
+ from xml.dom import minidom
+ from xml.etree.ElementTree import Element, SubElement, tostring
+
+
+ class GlossaryChecker:
+     def __init__(self, glossary_path):
+         self.glossary = self._load_glossary(glossary_path)
+         self._build_term_mappings()
+
+     def _load_glossary(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             return json.load(f)
+
+     def _normalize_tibetan_term(self, text):
+         """Normalize Tibetan text by removing common punctuation."""
+         text = text.replace("།", "")
+         if text.endswith("་"):
+             text = text[:-1]
+         return text
+
+     def get_tibetan_syllables(self, text):
+         """Split Tibetan text into syllables."""
+         text = text.replace("།", "")
+         syllables = []
+         for chunk in text.split():
+             chunk = chunk.strip()
+             syllables.extend(chunk.split("་"))
+         return syllables
+
+     def _build_term_mappings(self):
+         """Build mappings for terms, including their semantic categories and definitions."""
+         self.term_info = {}  # Store complete term information
+         self.terms = set()  # Normalized terms for matching
+
+         for term, data in self.glossary.items():
+             normalized_term = self._normalize_tibetan_term(term)
+             self.terms.add(normalized_term)
+
+             # Initialize term info with original form
+             self.term_info[normalized_term] = {"original_term": term, "categories": {}}
+
+             # Store data by semantic category
+             for category, cat_data in data.items():
+                 if isinstance(cat_data, dict):
+                     self.term_info[normalized_term]["categories"][category] = {
+                         "translations": cat_data.get("translations", []),
+                         "definitions": cat_data.get("definitions", []),
+                     }
+
+     def extract_terms(self, text):
+         """Extract terms based on Tibetan syllable matching."""
+         text_syllables = self.get_tibetan_syllables(text)
+         found_terms = []
+
+         i = 0
+         while i < len(text_syllables):
+             longest_match = None
+             for j in range(len(text_syllables), i, -1):
+                 possible_term = "་".join(text_syllables[i:j])
+                 if possible_term in self.terms:
+                     longest_match = possible_term
+                     break
+
+             if longest_match:
+                 found_terms.append(longest_match)
+                 i += len(longest_match.split("་"))
+             else:
+                 i += 1
+
+         return found_terms
+
+     def check(self, source_text, translation_text):
+         """Check source text and translation against the glossary with category information."""
+         results = []
+         found_terms = self.extract_terms(source_text)
+
+         for term in found_terms:
+             term_data = self.term_info[term]
+
+             result = {
+                 "source_term": term_data["original_term"],
+                 "normalized_term": term,
+                 "categories": {},
+                 "found_in_source": True,
+                 "found_in_translation": False,
+             }
+
+             # Check translations for each semantic category
+             for category, cat_data in term_data["categories"].items():
+                 result["categories"][category] = {
+                     "translations": cat_data["translations"],
+                     "definitions": cat_data["definitions"],
+                     "translation_found": False,
+                 }
+
+                 # Check if any expected translations appear
+                 for trans in cat_data["translations"]:
+                     if trans in translation_text:
+                         result["categories"][category]["translation_found"] = True
+                         result["found_in_translation"] = True
+                         break
+
+             results.append(result)
+
+         return results
+
+     def results_to_xml(self, results, source_text, translation_text, pretty_print=True):
+         """Convert checker results to XML format.
+
+         Args:
+             results: List of result dictionaries from check()
+             source_text: Original source text that was checked
+             translation_text: Translation text that was checked
+             pretty_print: Whether to format the XML with proper indentation
+
+         Returns:
+             str: XML string representation of the results
+         """
+         # Create root element
+         root = Element("glossary_check")
+
+         # Add text information
+         texts = SubElement(root, "texts")
+         source = SubElement(texts, "source")
+         source.text = source_text
+         translation = SubElement(texts, "translation")
+         translation.text = translation_text
+
+         # Add found terms
+         terms = SubElement(root, "terms")
+
+         for result in results:
+             term = SubElement(terms, "term")
+
+             # Add term information
+             source_term = SubElement(term, "source_term")
+             source_term.text = result["source_term"]
+
+             norm_term = SubElement(term, "normalized_term")
+             norm_term.text = result["normalized_term"]
+
+             found_status = SubElement(term, "found_status")
+             SubElement(found_status, "in_source").text = str(result["found_in_source"])
+             SubElement(found_status, "in_translation").text = str(
+                 result["found_in_translation"]
+             )
+
+             # Add categories
+             categories = SubElement(term, "categories")
+             for cat_name, cat_data in result["categories"].items():
+                 category = SubElement(categories, "category")
+                 category.set("type", cat_name)
+
+                 # Add translations
+                 translations = SubElement(category, "translations")
+                 translations.set("found", str(cat_data["translation_found"]))
+                 for trans in cat_data["translations"]:
+                     trans_elem = SubElement(translations, "translation")
+                     trans_elem.text = trans
+
+                 # Add definitions
+                 definitions = SubElement(category, "definitions")
+                 for defn in cat_data["definitions"]:
+                     defn_elem = SubElement(definitions, "definition")
+                     defn_elem.text = defn
+
+         # Convert to string with pretty printing if requested
+         if pretty_print:
+             xml_str = minidom.parseString(
+                 tostring(root, encoding="unicode")
+             ).toprettyxml(indent="  ")
+             # Remove empty lines from pretty printed output
+             xml_str = "\n".join([line for line in xml_str.split("\n") if line.strip()])
+             return xml_str
+
+         return tostring(root, encoding="unicode")
+
+
+ # Example usage:
+ if __name__ == "__main__":
+     glossary_path = Path(__file__).parent / "data" / "84000_glossary.json"
+     checker = GlossaryChecker(glossary_path)
+
+     source = "བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །"
+     translation = "I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows."
+
+     # Get check results
+     results = checker.check(source, translation)
+
+     # Convert to XML and print
+     xml_output = checker.results_to_xml(results, source, translation)
+     print(xml_output)
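
The syllable matcher in extract_terms is greedy: at each position it tries the longest candidate span first, so a multi-syllable glossary entry shadows any shorter entry it contains. A small sketch of that behavior with a toy glossary (the terms and file name here are illustrative assumptions, not real glossary contents):

```python
import json
from glossary_checker import GlossaryChecker

# Toy glossary where a short entry is a prefix of a longer one.
toy = {
    "བདེ་གཤེགས་": {"term": {"translations": ["sugata"], "definitions": []}},
    "བདེ་གཤེགས་སྲས་": {"term": {"translations": ["offspring of the sugatas"], "definitions": []}},
}
with open("toy_glossary.json", "w", encoding="utf-8") as f:
    json.dump(toy, f, ensure_ascii=False)

checker = GlossaryChecker("toy_glossary.json")
# Greedy longest match: the three-syllable entry wins over its two-syllable prefix.
print(checker.extract_terms("བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ།"))  # ['བདེ་གཤེགས་སྲས']
```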
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ anthropic
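
To run the app locally: pip install -r requirements.txt, then python app.py to launch the Gradio interface; the standalone __main__ block in trans_validator.py additionally reads the API key from the ANTHROPIC_API_KEY environment variable via os.getenv.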
trans_validator.py ADDED
@@ -0,0 +1,221 @@
+ import json
+ import re
+ from pathlib import Path
+
+ from anthropic import Anthropic
+
+ from glossary_checker import GlossaryChecker
+
+
+ class TranslationValidator:
+     def __init__(self, glossary_checker, anthropic_api_key):
+         """Initialize validator with glossary checker and API key."""
+         self.checker = glossary_checker
+         self.client = Anthropic(api_key=anthropic_api_key)
+
+     def analyze_terms(self, source_text, target_text, found_terms):
+         """Analyze terms using Claude to assess their usage and translation in context."""
+         if not found_terms:
+             return []
+
+         prompt = f"""Analyze each term found in this Tibetan text and its translation:
+
+ Tibetan text: {source_text}
+ English translation: {target_text}
+
+ For each term, I'll provide:
+ - The term
+ - Expected translations from glossary
+
+ Please analyze:"""
+
+         # Add term details to prompt
+         for term in found_terms:
+             prompt += f"\n\nTerm: {term['source_term']}"
+             for cat_name, cat_data in term['categories'].items():
+                 prompt += f"\nCategory '{cat_name}':"
+                 prompt += f"\n- Expected translations: {', '.join(cat_data['translations'])}"
+                 if 'definitions' in cat_data:
+                     prompt += f"\n- Definitions: {', '.join(cat_data['definitions'])}"
+
+         prompt += """\n
+ For each term, provide analysis in JSON format:
+ [{
+     "term": "term1",
+     "analysis": {
+         "translated_as": "how it appears in the target translation",
+         "glossary_translation": "how it should be translated according to the glossary",
+         "matching_categories": ["category1", "category2"],
+         "translation_assessment": {
+             "translated_correctly": true/false,
+             "should_be_counted": true/false
+         }
+     }
+ }]
+
+ Key points for analysis:
+ 1. should_be_counted: true if the term's usage matches any of the glossary definitions
+ 2. glossary_translation: choose the most appropriate translation from glossary based on the context and definitions
+ 3. Consider both the definitions and provided translations when analyzing the term's usage
+ 4. translated_correctly: true if the term matches the glossary definition with these specific conditions:
+     4.1. If the Tibetan term is translated with an English word that differs from the glossary's Sanskrit/English term:
+         - NOT correct, even if semantically equivalent
+         Example:
+         - ལུང་། translated as "scriptures" but glossary shows "Āgama" → incorrect
+         - རྒྱུད། translated as "continuum" but glossary shows "tantra" → incorrect
+
+     4.2. If the Tibetan term is translated with the same word as in glossary but with grammatical variations:
+         - Correct if only differs in:
+             * Singular/plural forms (sugata/sugatas)
+             * Case variations (buddha/buddha's)
+             * Common derived forms (dharma/dharmic)
+         Example:
+         - བདེ་གཤེགས། translated as "sugatas" with glossary showing "sugata" → correct
+         - སངས་རྒྱས། translated as "buddha's" with glossary showing "buddha" → correct
+
+     4.3. The translation must use the exact word given in the glossary (allowing only for basic grammatical variations) rather than synonyms or semantic equivalents."""
+
+         try:
+             message = self.client.messages.create(
+                 model="claude-3-sonnet-20240229",
+                 max_tokens=2000,
+                 messages=[{"role": "user", "content": prompt}],
+             )
+
+             json_match = re.search(r"\[.*\]", message.content[0].text, re.DOTALL)
+             if not json_match:
+                 return []
+
+             analysis = json.loads(json_match.group())
+
+             # Add analysis to each term
+             analyzed_terms = []
+             for term in found_terms:
+                 for item in analysis:
+                     if item["term"] == term["source_term"]:
+                         # Preserve original term data and add analysis
+                         analyzed_term = {
+                             "source_term": term["source_term"],
+                             "categories": {},  # filled below with matching categories only
+                             "analysis": item["analysis"]
+                         }
+
+                         # Only include matching categories
+                         for cat_name, cat_data in term["categories"].items():
+                             if cat_name in item["analysis"]["matching_categories"]:
+                                 analyzed_term["categories"][cat_name] = cat_data
+
+                         analyzed_terms.append(analyzed_term)
+                         break
+
+             return analyzed_terms
+
+         except (json.JSONDecodeError, KeyError) as e:
+             print(f"Error parsing LLM response: {e}")
+             return []
+
+     def calculate_translation_score(self, found_terms):
+         """Calculate translation score based on correct translations."""
+         if not found_terms:
+             return 0.0
+
+         total_countable_terms = 0
+         correctly_translated = 0
+
+         for term in found_terms:
+             analysis = term["analysis"]
+             assessment = analysis["translation_assessment"]
+
+             # Only count terms that should be counted and match glossary
+             if assessment["should_be_counted"]:
+                 total_countable_terms += 1
+                 if assessment["translated_correctly"]:
+                     correctly_translated += 1
+
+         return (correctly_translated / total_countable_terms * 100) if total_countable_terms > 0 else 100.0
+
+     def validate_translation(self, aligned_file_path):
+         """Process aligned file and validate translations."""
+         aligned_pairs = self.load_aligned_file(aligned_file_path)
+
+         results = []
+         for line_num, (source, target) in enumerate(aligned_pairs, 1):
+             # Check against glossary
+             check_results = self.checker.check(source, target)
+
+             # Analyze terms
+             analyzed_terms = self.analyze_terms(source, target, check_results)
+
+             # Calculate score
+             score = self.calculate_translation_score(analyzed_terms)
+
+             results.append({
+                 "line_number": line_num,
+                 "source": source,
+                 "target": target,
+                 "terms": analyzed_terms,
+                 "score": score,
+             })
+
+         return results
+
+     def load_aligned_file(self, file_path):
+         """Load tab-separated source and target segments."""
+         aligned_pairs = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+
+                 parts = line.split("\t")
+                 if len(parts) != 2:
+                     print(f"Warning: Skipping malformed line: {line}")
+                     continue
+
+                 source, target = parts
+                 aligned_pairs.append((source.strip(), target.strip()))
+
+         return aligned_pairs
+
+     def save_results(self, results, output_path):
+         """Save validation results to JSON file."""
+         with open(output_path, "w", encoding="utf-8") as f:
+             json.dump(
+                 {
+                     "summary": {
+                         "total_lines": len(results),
+                         "average_score": (
+                             sum(r["score"] for r in results) / len(results)
+                             if results
+                             else 0
+                         ),
+                     },
+                     "lines": results,
+                 },
+                 f,
+                 ensure_ascii=False,
+                 indent=2,
+             )
+
+
+ # Example usage:
+ if __name__ == "__main__":
+     import os
+
+     data_path = Path(__file__).parent / "data"
+
+     # Initialize components
+     glossary_path = data_path / "84000_glossary.json"
+     checker = GlossaryChecker(glossary_path)
+     validator = TranslationValidator(checker, os.getenv("ANTHROPIC_API_KEY"))
+
+     # Process aligned file
+     aligned_file = data_path / "example_translations.txt"
+     results = validator.validate_translation(aligned_file)
+
+     # Save results
+     validator.save_results(results, data_path / "validation_results.json")
+
+     print("Validation completed. Results saved to 'data/validation_results.json'.")
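
As a quick sanity check of the scoring formula (score = correctly translated / counted terms × 100, with uncounted terms ignored and 100.0 returned when nothing is countable), calculate_translation_score can be exercised on hand-built assessments; the toy data below is illustrative, not real validator output:

```python
from trans_validator import TranslationValidator  # requires the anthropic package to be importable

# calculate_translation_score only reads term["analysis"]["translation_assessment"],
# so minimal dicts suffice; self is unused, so the method is called unbound with None.
toy_terms = [
    {"analysis": {"translation_assessment": {"should_be_counted": True, "translated_correctly": True}}},
    {"analysis": {"translation_assessment": {"should_be_counted": True, "translated_correctly": False}}},
    {"analysis": {"translation_assessment": {"should_be_counted": False, "translated_correctly": False}}},
]
print(TranslationValidator.calculate_translation_score(None, toy_terms))  # 50.0: 1 of 2 counted terms correct
```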