DWizard committed on
Commit
5a7c441
·
1 Parent(s): 6d9ba90

add forced conversion of terms into Chinese before translation

Browse files

Former-commit-id: 2b6ec94ed31ea352361c1591a6db9a4b3e775fb3

Files changed (2) hide show
  1. finetune_data/dict.csv +173 -0
  2. pipeline.py +45 -3
finetune_data/dict.csv ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ barracks,兵营
2
+ engineering bay,工程站
3
+ forge,锻炉
4
+ blink,闪现
5
+ evolution chamber,进化腔
6
+ cybernetics core,控制芯核
7
+ enhanced shockwaves,EMP范围
8
+ gravitic boosters,ob速度
9
+ armory,军械库
10
+ robotics bay,机械研究所
11
+ twilight council,光影议会
12
+ fusion core,聚变芯体
13
+ fleet beacon,舰队航标
14
+ factory,重工厂
15
+ ghost academy,幽灵军校
16
+ infestation pit,感染深渊
17
+ robotics facility,机械台
18
+ stargate,星门
19
+ starport,星港
20
+ archon,白球
21
+ smart servos,变形加速
22
+ gateway,兵营
23
+ warpgate,兵营
24
+ immortal,不朽
25
+ zealot,叉叉
26
+ nydus network,虫洞
27
+ nydus worm,虫洞
28
+ hydralisk,刺蛇
29
+ grooved spines,刺蛇射程
30
+ muscular augments,刺蛇速度
31
+ hydralisk den,刺蛇塔
32
+ planetary fortress,大地堡
33
+ battle cruiser,大和
34
+ weapon refit,大和炮
35
+ brood lord,大龙
36
+ greater spire,大龙塔
37
+ anabolic synthesis,大牛速度
38
+ cyclone,导弹车
39
+ bunker,地堡
40
+ lurker,地刺
41
+ seismic spines,地刺射程
42
+ adaptive talons,地刺速埋
43
+ lurker den,地刺塔
44
+ widow mine,地雷
45
+ ground carapace,地面单位甲壳等级
46
+ high templar,电兵
47
+ shield battery,电池
48
+ observer,叮当
49
+ baneling,毒爆
50
+ centrifugal hooks,毒爆速度
51
+ baneling nest,毒爆塔
52
+ raven,渡鸦
53
+ combat shield,盾
54
+ shield,盾
55
+ lair,二本
56
+ missile turret,防空
57
+ spore crawler,防空
58
+ supply depot,房子
59
+ overlord,房子
60
+ pneumatized carapace,房子速度
61
+ mutalisk,飞龙
62
+ spire,飞龙塔
63
+ viper,飞蛇
64
+ flyer attacks,飞行生物攻击等级
65
+ flyer carapace,飞行生物甲壳等级
66
+ tempest,风暴
67
+ tectonic destabilizers,风暴伤害
68
+ phoenix,凤凰
69
+ anion pulse-crystals,凤凰射程
70
+ corruptor,腐化
71
+ infestor,感染虫
72
+ pathogen glands,感染能量
73
+ zergling,狗
74
+ spawning pool,狗池
75
+ metabolic boost,狗速
76
+ spine crawler,管子
77
+ marauder,光头
78
+ ghost,鬼兵
79
+ arm silo with nuke,核弹
80
+ carrier,黄金舰队
81
+ hellion,火车
82
+ hellbat,火车侠
83
+ ravager,火蟑螂
84
+ nexus,基地
85
+ hatchery,基地
86
+ command center,基地
87
+ neosteel armor,建筑护甲
88
+ hi-sec auto tracking,建筑射程
89
+ ship weapons,舰船武器等级
90
+ charge,脚速
91
+ liberator,解放
92
+ advanced ballistics,解放射程
93
+ melee attacks,近战攻击等级
94
+ colossus,巨像
95
+ extended thermal lance,巨像射程
96
+ creep tumor,菌毯
97
+ tech lab,科技挂件
98
+ air armor,空中单位护甲等级
99
+ air weapons,空中单位武器等级
100
+ adrenal glands,狂狗
101
+ mule,矿螺
102
+ infernal pre-igniter,蓝火
103
+ thor,雷神
104
+ warp prism,棱镜
105
+ gravitic drive,棱镜速度
106
+ dragoon,龙骑士
107
+ cocoon,卵
108
+ larva,卵
109
+ mothership,妈妈船
110
+ burrow,埋地
111
+ changeling,拟态虫
112
+ ultralisk,牛
113
+ chitinous plating,牛甲
114
+ ultralisk cavern,牛塔
115
+ drone,农民
116
+ scv,农民
117
+ queen,女王
118
+ banshee,女妖
119
+ hyperflight rotors,女妖提速
120
+ photon cannon,炮台
121
+ missile attacks,喷射攻击等级
122
+ assimilator,气矿
123
+ extractor,气矿
124
+ refinery,气矿
125
+ roach,钱赞企
126
+ marine,枪兵
127
+ sensor tower,圈
128
+ infantry armor,人族防
129
+ infantry weapons,人族攻
130
+ hive,三本
131
+ psionic storm,闪电
132
+ templar archives,闪电塔
133
+ sentry,哨兵
134
+ ground armor,神族防
135
+ ground weapons,神族攻
136
+ adept,使徒
137
+ resonating glaives,使徒攻速
138
+ reactor,双倍挂件
139
+ pylon,水晶
140
+ reaper,死神
141
+ drilling claws,速埋
142
+ swarm host,宿主
143
+ mag-field accelerator,锁定增伤
144
+ siege tank,坦克
145
+ probe,探机
146
+ corvid reactor,铁鸦能量
147
+ neural parasite,同化完成
148
+ viking,维京
149
+ oracle,先知
150
+ broodling,小虫子
151
+ locust,小虫子
152
+ mothership core,小妈妈船
153
+ orbital command,星轨
154
+ stimpack,兴奋剂
155
+ void ray,虚空
156
+ flux vanes,虚空速度
157
+ overseer,眼虫
158
+ ignite afterburners,医疗机速度
159
+ dark templar,隐刀
160
+ shadow stride,隐刀闪现
161
+ dark shrine,隐刀塔
162
+ cloaking field,隐形
163
+ personal cloaking,隐形
164
+ medivac dropship,运输机
165
+ vehicle and ship plating,战车及舰船钢板等级
166
+ vehicle weapons,战车武器等级
167
+ war hound,战狼
168
+ roach warren,蟑螂巢
169
+ tunneling claws,蟑螂埋地
170
+ glial reconstitution,蟑螂速度
171
+ concussive shells,震撼弹
172
+ stalker,追猎
173
+ disruptor,自爆球
pipeline.py CHANGED
@@ -89,7 +89,7 @@ if not os.path.exists(f'{RESULT_PATH}/{VIDEO_NAME}'):
89
  # Instead of using the script_en variable directly, we'll use script_input
90
  srt_file_en = args.srt_file
91
  if srt_file_en is not None:
92
- with open(srt_file_en, 'r') as f:
93
  script_input = f.read()
94
  else:
95
  # using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
@@ -110,7 +110,7 @@ else:
110
  writer.write_result(transcript, srt)
111
 
112
  # split the video script(open ai prompt limit: about 5000)
113
- with open(srt_file_en, 'r') as f:
114
  script_en = f.read()
115
  script_input = script_en
116
 
@@ -119,9 +119,51 @@ if not args.only_srt:
119
  assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
120
  print('ASS subtitle saved as: ' + assSub_en)
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # Split the video script by sentences and create chunks within the token limit
123
  n_threshold = 1500 # Token limit for the GPT-3 model
124
- script_split = script_input.split('.')
125
 
126
  script_arr = []
127
  script = ""
 
89
  # Instead of using the script_en variable directly, we'll use script_input
90
  srt_file_en = args.srt_file
91
  if srt_file_en is not None:
92
+ with open(srt_file_en, 'r', encoding='utf-8') as f:
93
  script_input = f.read()
94
  else:
95
  # using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
 
110
  writer.write_result(transcript, srt)
111
 
112
  # split the video script(open ai prompt limit: about 5000)
113
+ with open(srt_file_en, 'r', encoding='utf-8') as f:
114
  script_en = f.read()
115
  script_input = script_en
116
 
 
119
  assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
120
  print('ASS subtitle saved as: ' + assSub_en)
121
 
122
+ # force-translate StarCraft II terms into Chinese according to the dictionary
123
+ # TODO: translate abbreviations, e.g. VA, ob
124
+ # TODO: variety of translation
125
+ from csv import reader
126
+ import re
127
+
# Load the forced-translation dictionary.  Each CSV row is expected to be
# "<english term>,<chinese term>"; the first column becomes the key and the
# second column the replacement text.
term_dict = {}
with open("finetune_data/dict.csv", 'r', encoding='utf-8') as f:
    for row in reader(f):
        # csv.reader yields [] for blank lines; skip those (and any row with
        # fewer than two columns) instead of raising IndexError on row[1].
        if len(row) < 2:
            continue
        term_dict[row[0]] = row[1]
132
+
def clean_timestamp(lines):
    """Replace every SRT cue header in *lines* with the marker ``_-_``.

    A cue header here is a run of digits, a newline, and a timestamp line
    such as ``00:00:01,000 --> 00:00:02,500``.

    Args:
        lines: SRT text as a single string.

    Returns:
        The text with each "index + timestamp" pair replaced by ``_-_``.
        Text without any cue header is returned unchanged.
    """
    # `.{25}` spans "HH:MM:SS,mmm --> HH:MM:SS"; the trailing `,[0-9]{3}`
    # consumes the end time's milliseconds.
    return re.sub(r'[0-9]+\n.{25},[0-9]{3}', '_-_', lines)
139
+
140
+
# Force-replace dictionary terms with their Chinese translation before the
# script is chunked and sent for translation.
#
# Matching is case-insensitive and restricted to whole words/phrases, so
# "Marine", "marine." and "marine\n" are all replaced while "marines" is left
# untouched.  Multi-word entries (e.g. "siege tank") are matched as a phrase.
# Look up by lowercase text because a match keeps the script's own casing.
lowered_terms = {term.lower(): zh for term, zh in term_dict.items() if term}
if lowered_terms:
    # Longest terms first so a phrase such as "hydralisk den" wins over its
    # single-word prefix "hydralisk".
    term_pattern = re.compile(
        r'(?<!\w)(?:'
        + '|'.join(
            re.escape(term)
            for term in sorted(lowered_terms, key=len, reverse=True)
        )
        + r')(?!\w)',
        re.IGNORECASE,
    )
    script_input_withForceTerm = term_pattern.sub(
        # Fall back to the matched text if case folding ever disagrees with
        # str.lower() for an unusual character.
        lambda match: lowered_terms.get(match.group(0).lower(), match.group(0)),
        script_input,
    )
else:
    # With no usable terms there is nothing to replace; an empty alternation
    # would otherwise match at every position.
    script_input_withForceTerm = script_input
162
+
163
+
164
  # Split the video script by sentences and create chunks within the token limit
165
  n_threshold = 1500 # Token limit for the GPT-3 model
166
+ script_split = script_input_withForceTerm.split('.')
167
 
168
  script_arr = []
169
  script = ""