"""Score free text with a character-trigram model and report the result.

The scoring helpers are import-safe; the model file is only read and the
Streamlit widgets are only created when the file is executed as a script.
"""

import math
import pickle

accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

# Maps every accepted character to its index along each axis of the matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}


def normalize(line):
    """Return the lower-cased characters of ``line`` found in accepted_chars.

    Every other character (punctuation, digits, infrequent symbols, ...) is
    dropped, which keeps the model small.

    Args:
        line: The text to filter.

    Returns:
        A list of single-character strings, in their original order.
    """
    lowered = (c.lower() for c in line)
    return [c for c in lowered if c in accepted_chars]


def ngram(n, l):
    """Yield every n-gram of ``l`` after normalizing it.

    Args:
        n: Number of characters per gram.
        l: The raw text.

    Yields:
        Each run of ``n`` consecutive normalized characters as a string.
        Nothing is yielded when fewer than ``n`` characters survive
        normalization.
    """
    filtered = normalize(l)
    for start in range(0, len(filtered) - n + 1):
        yield ''.join(filtered[start:start + n])


def get_lines():
    """Yield the lines of each dataset file, one line at a time."""
    datasets = ['big.txt']
    for filename in datasets:
        with open(filename) as fp:
            yield from fp


def avg_transition_prob(l, log_prob_mat):
    """Return the average transition probability of ``l``.

    Args:
        l: The text to score.
        log_prob_mat: Three-level nested sequence indexed as
            ``log_prob_mat[pos[a]][pos[b]][pos[c]]`` for each trigram
            ``a, b, c``; the entries are assumed to be log-probabilities,
            as the name suggests.

    Returns:
        ``exp`` of the mean looked-up value over all trigrams of ``l``.
        Text with no trigram at all scores ``1.0``, because the divisor
        falls back to 1 and the sum stays at 0.
    """
    log_prob = 0.0
    transition_ct = 0
    for a, b, c in ngram(3, l):
        log_prob += log_prob_mat[pos[a]][pos[b]][pos[c]]
        transition_ct += 1
    # The exponentiation translates from log probs to probs.
    return math.exp(log_prob / (transition_ct or 1))


def _load_model(path='gib_model.pki'):
    """Load the pickled model from ``path``.

    SECURITY: unpickling can execute arbitrary code, so ``path`` must only
    ever point at a trusted file.
    """
    with open(path, 'rb') as fp:
        return pickle.load(fp)


def main():
    """Read a message from the page and show whether it beats the threshold."""
    # Imported here so the scoring helpers stay usable without Streamlit.
    # NOTE(review): the original used `st` without importing it anywhere in
    # the visible file; the calls below are Streamlit's API.
    import streamlit as st

    model_data = _load_model()
    model_mat = model_data['mat']
    threshold = model_data['thresh']

    # No loop is needed: Streamlit reruns the whole script on every widget
    # interaction, and an endless loop here would never return.
    l = st.text_area('enter a prospection message')
    st.write(avg_transition_prob(l, model_mat) > threshold)


if __name__ == '__main__':
    main()