DWizard committed on
Commit
d438792
·
1 Parent(s): 9782df8

spellchecker(enchant) added, known bug(line 206)

Browse files

Former-commit-id: 5dea602712d88bf2f0a49321497f5575a86beede

SRT.py CHANGED
@@ -2,7 +2,6 @@ from datetime import timedelta
2
  import os
3
  import whisper
4
  from csv import reader
5
- import re
6
 
7
  class SRT_segment(object):
8
  def __init__(self, *args) -> None:
@@ -20,12 +19,12 @@ class SRT_segment(object):
20
  self.end_time_str = str(0)+end_time.split('.')[0]+',000'
21
  else:
22
  self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
23
- self.source_text = segment['text'][1:]
24
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
25
  self.translation = ""
26
 
27
  elif isinstance(args[0], list):
28
- self.source_text = args[0][2][:-1]
29
  self.duration = args[0][1]
30
  self.start_time_str = self.duration.split(" --> ")[0]
31
  self.end_time_str = self.duration.split(" --> ")[1]
@@ -179,29 +178,91 @@ class SRT_script():
179
  # TODO: variety of translation
180
 
181
  # load term dictionary
182
- with open("finetune_data/dict.csv",'r', encoding='utf-8') as f:
183
- csv_reader = reader(f)
184
- term_dict = {rows[0]:rows[1] for rows in csv_reader}
185
 
186
  # change term
187
  for seg in self.segments:
188
- ready_words = re.sub('\n', '\n ', seg.source_text).split(" ")
189
  for i in range(len(ready_words)):
190
  word = ready_words[i]
191
- if word[-2:] == ".\n" :
192
- if word[:-2].lower() in term_dict :
193
- new_word = word.replace(word[:-2], term_dict.get(word[:-2].lower())) + ' '
194
- ready_words[i] = new_word
195
- else:
196
- ready_words[i] = word + ' '
197
- elif word.lower() in term_dict :
198
- new_word = word.replace(word,term_dict.get(word.lower())) + ' '
199
- ready_words[i] = new_word
200
- else :
201
- ready_words[i]= word + ' '
202
- seg.source_text = re.sub('\n ', '\n', "".join(ready_words))
 
203
  pass
204
 
205
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import whisper
4
  from csv import reader
 
5
 
6
  class SRT_segment(object):
7
  def __init__(self, *args) -> None:
 
19
  self.end_time_str = str(0)+end_time.split('.')[0]+',000'
20
  else:
21
  self.end_time_str = str(0)+end_time.split('.')[0]+','+end_time.split('.')[1][:3]
22
+ self.source_text = segment['text']
23
  self.duration = f"{self.start_time_str} --> {self.end_time_str}"
24
  self.translation = ""
25
 
26
  elif isinstance(args[0], list):
27
+ self.source_text = args[0][2]
28
  self.duration = args[0][1]
29
  self.start_time_str = self.duration.split(" --> ")[0]
30
  self.end_time_str = self.duration.split(" --> ")[1]
 
178
  # TODO: variety of translation
179
 
180
  # load term dictionary
181
+ # with open("dict_enzh.csv",'r', encoding='utf-8') as f:
182
+ # csv_reader = reader(f)
183
+ # term_enzh_dict = {rows[0]:rows[1] for rows in csv_reader}
184
 
185
  # change term
186
  for seg in self.segments:
187
+ ready_words = seg.source_text.split(" ")
188
  for i in range(len(ready_words)):
189
  word = ready_words[i]
190
+ ready_words[i] = self.spell_correction(word, 0)
191
+ # if word[-2:] == ".\n":
192
+ # if word[:-2].lower() in term_enzh_dict:
193
+ # new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
194
+ # ready_words[i] = new_word
195
+ # else:
196
+ # ready_words[i] = word
197
+ # elif word.lower() in term_enzh_dict:
198
+ # new_word = word.replace(word,term_enzh_dict.get(word.lower()))
199
+ # ready_words[i] = new_word
200
+ # else:
201
+ # ready_words[i]= word
202
+ seg.source_text = " ".join(ready_words)
203
  pass
204
 
205
def spell_check_term(self):
    """Spell-check every segment's source text against the term word list.

    Each segment's ``source_text`` is split on single spaces, every token is
    run through ``spell_correction`` in spell-check mode (``arg=1``), and the
    tokens are joined back with single spaces and stored on the segment.

    The enchant dictionary and the personal word list are built once here and
    passed down, instead of being rebuilt for every word.
    """
    if not self.segments:
        return

    # Third-party dependency; imported lazily so that term translation keeps
    # working on machines without enchant installed.
    import enchant

    checker = enchant.Dict('en_US')
    term_speller = enchant.PyPWL('finetune_data/dict_freq.txt')

    for seg in self.segments:
        seg.source_text = " ".join(
            self.spell_correction(
                word, 1, checker=checker, term_speller=term_speller
            )
            for word in seg.source_text.split(" ")
        )


def spell_correction(self, word: str, arg: int, *,
                     term_dict=None, checker=None, term_speller=None):
    """Translate or spell-correct a single whitespace-delimited token.

    A trailing ``".\\n"`` or a single trailing ``.``, newline, ``,``, ``!``
    or ``?`` is split off first and re-attached unchanged to the result.

    Args:
        word: The token to process, possibly carrying trailing punctuation.
        arg: ``0`` for term-translate mode (look the lower-cased stem up in
            the en->zh term dictionary), ``1`` for spell-check mode (replace
            stems unknown to the en_US dictionary with the first suggestion
            from the term word list).
        term_dict: Optional preloaded ``{term: translation}`` mapping for
            mode 0. When omitted, ``finetune_data/dict_enzh.csv`` is read.
        checker: Optional preloaded ``enchant.Dict`` for mode 1.
        term_speller: Optional preloaded ``enchant.PyPWL`` for mode 1.

    Returns:
        The corrected/translated token, or ``word`` unchanged when nothing
        applies (including when ``arg`` is neither 0 nor 1).
    """
    if arg not in (0, 1):
        # The original validation could never fire; report the misuse and
        # leave the token untouched rather than crashing the caller.
        print('only 0 or 1 for argument')
        return word

    # Locate where the trailing punctuation (if any) begins.
    if word[-2:] == ".\n":
        pos = len(word) - 2
    elif word[-1:] in (".", "\n", ",", "!", "?"):
        pos = len(word) - 1
    else:
        pos = len(word)
    stem, tail = word[:pos], word[pos:]
    real_word = stem.lower()

    # Empty tokens (from consecutive spaces) and punctuation-only tokens have
    # nothing to look up; enchant also rejects empty strings.
    if not real_word:
        return word

    if arg == 0:  # term translate mode
        if term_dict is None:
            with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
                # Skip blank or malformed rows instead of raising IndexError.
                term_dict = {row[0]: row[1] for row in reader(f) if len(row) >= 2}
        if real_word in term_dict:
            # Replace only the stem; keep the trailing punctuation as-is.
            return term_dict[real_word] + tail
        return word

    # term spell check mode
    if checker is None or term_speller is None:
        import enchant
        if checker is None:
            checker = enchant.Dict('en_US')
        if term_speller is None:
            term_speller = enchant.PyPWL('finetune_data/dict_freq.txt')

    # Accept the token when either the lower-cased or the original-case stem
    # is a known word; checking only the lower-cased form wrongly flagged
    # words such as "I've".
    if checker.check(real_word) or checker.check(stem):
        return word

    suggestions = term_speller.suggest(real_word)
    if suggestions:  # relax spell check: keep the word when nothing is close
        return suggestions[0] + tail
    return word
finetune_data/{dict.csv → dict_enzh.csv} RENAMED
@@ -1,4 +1,4 @@
1
- barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
 
1
+ barracks,兵营
2
  engineering bay,工程站
3
  forge,锻炉
4
  blink,闪现
finetune_data/dict_freq.csv ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks,1
2
+ engineering bay,1
3
+ forge,1
4
+ blink,1
5
+ evolution chamber,1
6
+ cybernetics core,1
7
+ enhanced shockwaves,1
8
+ gravitic boosters,1
9
+ armory,1
10
+ robotics bay,1
11
+ twilight council,1
12
+ fusion core,1
13
+ fleet beacon,1
14
+ factory,1
15
+ ghost academy,1
16
+ infestation pit,1
17
+ robotics facility,1
18
+ stargate,1
19
+ starport,1
20
+ archon,1
21
+ smart servos,1
22
+ gateway,1
23
+ warpgate,1
24
+ immortal,1
25
+ zealot,1
26
+ nydus network,1
27
+ nydus worm,1
28
+ hydralisk,1
29
+ grooved spines,1
30
+ muscular augments,1
31
+ hydralisk den,1
32
+ planetary fortress,1
33
+ battle cruiser,1
34
+ weapon refit,1
35
+ brood lord,1
36
+ greater spire,1
37
+ anabolic synthesis,1
38
+ cyclone,1
39
+ bunker,1
40
+ lurker,1
41
+ seismic spines,1
42
+ adaptive talons,1
43
+ lurker den,1
44
+ widow mine,1
45
+ ground carapace,1
46
+ high templar,1
47
+ shield battery,1
48
+ observer,1
49
+ baneling,1
50
+ centrifugal hooks,1
51
+ baneling nest,1
52
+ raven,1
53
+ combat shield,1
54
+ shield,1
55
+ lair,1
56
+ missile turret,1
57
+ spore crawler,1
58
+ supply depot,1
59
+ overlord,1
60
+ pneumatized carapace,1
61
+ mutalisk,1
62
+ spire,1
63
+ viper,1
64
+ flyer attacks,1
65
+ flyer carapace,1
66
+ tempest,1
67
+ tectonic destabilizers,1
68
+ phoenix,1
69
+ anion pulse-crystals,1
70
+ corruptor,1
71
+ infestor,1
72
+ pathogen glands,1
73
+ zergling,1
74
+ spawning pool,1
75
+ metabolic boost,1
76
+ spine crawler,1
77
+ marauder,1
78
+ ghost,1
79
+ arm silo with nuke,1
80
+ carrier,1
81
+ hellion,1
82
+ hellbat,1
83
+ ravager,1
84
+ nexus,1
85
+ hatchery,1
86
+ command center,1
87
+ neosteel armor,1
88
+ hi-sec auto tracking,1
89
+ ship weapons,1
90
+ charge,1
91
+ liberator,1
92
+ advanced ballistics,1
93
+ melee attacks,1
94
+ colossus,1
95
+ extended thermal lance,1
96
+ creep tumor,1
97
+ tech lab,1
98
+ air armor,1
99
+ air weapons,1
100
+ adrenal glands,1
101
+ mule,1
102
+ infernal pre-igniter,1
103
+ thor,1
104
+ warp prism,1
105
+ gravitic drive,1
106
+ dragoon,1
107
+ cocoon,1
108
+ larva,1
109
+ mothership,1
110
+ burrow,1
111
+ changeling,1
112
+ ultralisk,1
113
+ chitinous plating,1
114
+ ultralisk cavern,1
115
+ drone,1
116
+ scv,1
117
+ queen,1
118
+ banshee,1
119
+ hyperflight rotors,1
120
+ photon cannon,1
121
+ missile attacks,1
122
+ assimilator,1
123
+ extractor,1
124
+ refinery,1
125
+ roach,1
126
+ marine,1
127
+ sensor tower,1
128
+ infantry armor,1
129
+ infantry weapons,1
130
+ hive,1
131
+ psionic storm,1
132
+ templar archives,1
133
+ sentry,1
134
+ ground armor,1
135
+ ground weapons,1
136
+ adept,1
137
+ resonating glaives,1
138
+ reactor,1
139
+ pylon,1
140
+ reaper,1
141
+ drilling claws,1
142
+ swarm host,1
143
+ mag-field accelerator,1
144
+ siege tank,1
145
+ probe,1
146
+ corvid reactor,1
147
+ neural parasite,1
148
+ viking,1
149
+ oracle,1
150
+ broodling,1
151
+ locust,1
152
+ mothership core,1
153
+ orbital command,1
154
+ stimpack,1
155
+ void ray,1
156
+ flux vanes,1
157
+ overseer,1
158
+ ignite afterburners,1
159
+ dark templar,1
160
+ shadow stride,1
161
+ dark shrine,1
162
+ cloaking field,1
163
+ personal cloaking,1
164
+ medivac dropship,1
165
+ vehicle and ship plating,1
166
+ vehicle weapons,1
167
+ war hound,1
168
+ roach warren,1
169
+ tunneling claws,1
170
+ glial reconstitution,1
171
+ concussive shells,1
172
+ stalker,1
173
+ disruptor,1
174
+ zerg,1
175
+ protoss,1
176
+ terran,1
finetune_data/dict_freq.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks
2
+ engineering bay
3
+ forge
4
+ blink
5
+ evolution chamber
6
+ cybernetics core
7
+ enhanced shockwaves
8
+ gravitic boosters
9
+ armory
10
+ robotics bay
11
+ twilight council
12
+ fusion core
13
+ fleet beacon
14
+ factory
15
+ ghost academy
16
+ infestation pit
17
+ robotics facility
18
+ stargate
19
+ starport
20
+ archon
21
+ smart servos
22
+ gateway
23
+ warpgate
24
+ immortal
25
+ zealot
26
+ nydus network
27
+ nydus worm
28
+ hydralisk
29
+ grooved spines
30
+ muscular augments
31
+ hydralisk den
32
+ planetary fortress
33
+ battle cruiser
34
+ weapon refit
35
+ brood lord
36
+ greater spire
37
+ anabolic synthesis
38
+ cyclone
39
+ bunker
40
+ lurker
41
+ seismic spines
42
+ adaptive talons
43
+ lurker den
44
+ widow mine
45
+ ground carapace
46
+ high templar
47
+ shield battery
48
+ observer
49
+ baneling
50
+ centrifugal hooks
51
+ baneling nest
52
+ raven
53
+ combat shield
54
+ shield
55
+ lair
56
+ missile turret
57
+ spore crawler
58
+ supply depot
59
+ overlord
60
+ pneumatized carapace
61
+ mutalisk
62
+ spire
63
+ viper
64
+ flyer attacks
65
+ flyer carapace
66
+ tempest
67
+ tectonic destabilizers
68
+ phoenix
69
+ anion pulse-crystals
70
+ corruptor
71
+ infestor
72
+ pathogen glands
73
+ zergling
74
+ spawning pool
75
+ metabolic boost
76
+ spine crawler
77
+ marauder
78
+ ghost
79
+ arm silo with nuke
80
+ carrier
81
+ hellion
82
+ hellbat
83
+ ravager
84
+ nexus
85
+ hatchery
86
+ command center
87
+ neosteel armor
88
+ hi-sec auto tracking
89
+ ship weapons
90
+ charge
91
+ liberator
92
+ advanced ballistics
93
+ melee attacks
94
+ colossus
95
+ extended thermal lance
96
+ creep tumor
97
+ tech lab
98
+ air armor
99
+ air weapons
100
+ adrenal glands
101
+ mule
102
+ infernal pre-igniter
103
+ thor
104
+ warp prism
105
+ gravitic drive
106
+ dragoon
107
+ cocoon
108
+ larva
109
+ mothership
110
+ burrow
111
+ changeling
112
+ ultralisk
113
+ chitinous plating
114
+ ultralisk cavern
115
+ drone
116
+ scv
117
+ queen
118
+ banshee
119
+ hyperflight rotors
120
+ photon cannon
121
+ missile attacks
122
+ assimilator
123
+ extractor
124
+ refinery
125
+ roach
126
+ marine
127
+ sensor tower
128
+ infantry armor
129
+ infantry weapons
130
+ hive
131
+ psionic storm
132
+ templar archives
133
+ sentry
134
+ ground armor
135
+ ground weapons
136
+ adept
137
+ resonating glaives
138
+ reactor
139
+ pylon
140
+ reaper
141
+ drilling claws
142
+ swarm host
143
+ mag-field accelerator
144
+ siege tank
145
+ probe
146
+ corvid reactor
147
+ neural parasite
148
+ viking
149
+ oracle
150
+ broodling
151
+ locust
152
+ mothership core
153
+ orbital command
154
+ stimpack
155
+ void ray
156
+ flux vanes
157
+ overseer
158
+ ignite afterburners
159
+ dark templar
160
+ shadow stride
161
+ dark shrine
162
+ cloaking field
163
+ personal cloaking
164
+ medivac dropship
165
+ vehicle and ship plating
166
+ vehicle weapons
167
+ war hound
168
+ roach warren
169
+ tunneling claws
170
+ glial reconstitution
171
+ concussive shells
172
+ stalker
173
+ disruptor
174
+ zerg
175
+ protoss
176
+ terran
177
+ starcraft