Spaces:
Runtime error
Runtime error
from __future__ import absolute_import, unicode_literals | |
import re | |
import os | |
import sys | |
import pickle | |
from .._compat import * | |
MIN_FLOAT = -3.14e100 | |
PROB_START_P = "prob_start.p" | |
PROB_TRANS_P = "prob_trans.p" | |
PROB_EMIT_P = "prob_emit.p" | |
PrevStatus = { | |
'B': 'ES', | |
'M': 'MB', | |
'S': 'SE', | |
'E': 'BM' | |
} | |
def load_model(): | |
start_p = pickle.load(get_module_res("finalseg", PROB_START_P)) | |
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P)) | |
emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P)) | |
return start_p, trans_p, emit_p | |
if sys.platform.startswith("java"): | |
start_P, trans_P, emit_P = load_model() | |
else: | |
from .prob_start import P as start_P | |
from .prob_trans import P as trans_P | |
from .prob_emit import P as emit_P | |
def viterbi(obs, states, start_p, trans_p, emit_p): | |
V = [{}] # tabular | |
path = {} | |
for y in states: # init | |
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT) | |
path[y] = [y] | |
for t in xrange(1, len(obs)): | |
V.append({}) | |
newpath = {} | |
for y in states: | |
em_p = emit_p[y].get(obs[t], MIN_FLOAT) | |
(prob, state) = max( | |
[(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]]) | |
V[t][y] = prob | |
newpath[y] = path[state] + [y] | |
path = newpath | |
(prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES') | |
return (prob, path[state]) | |
def __cut(sentence): | |
global emit_P | |
prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P) | |
begin, nexti = 0, 0 | |
# print pos_list, sentence | |
for i, char in enumerate(sentence): | |
pos = pos_list[i] | |
if pos == 'B': | |
begin = i | |
elif pos == 'E': | |
yield sentence[begin:i + 1] | |
nexti = i + 1 | |
elif pos == 'S': | |
yield char | |
nexti = i + 1 | |
if nexti < len(sentence): | |
yield sentence[nexti:] | |
re_han = re.compile("([\u4E00-\u9FD5]+)") | |
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") | |
def cut(sentence): | |
sentence = strdecode(sentence) | |
blocks = re_han.split(sentence) | |
for blk in blocks: | |
if re_han.match(blk): | |
for word in __cut(blk): | |
yield word | |
else: | |
tmp = re_skip.split(blk) | |
for x in tmp: | |
if x: | |
yield x | |