Update functions.py
Browse files- functions.py +62 -56
functions.py
CHANGED
@@ -94,6 +94,8 @@ initial_qa_template = (
|
|
94 |
"answer the question: {question}\n.\n"
|
95 |
)
|
96 |
|
|
|
|
|
97 |
@st.experimental_singleton(suppress_st_warning=True)
|
98 |
def load_models():
|
99 |
q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
|
@@ -116,15 +118,27 @@ def load_asr_model(asr_model_name):
|
|
116 |
|
117 |
return asr_model
|
118 |
|
119 |
-
@st.experimental_singleton(suppress_st_warning=True)
|
120 |
-
def load_sbert(model_name):
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
127 |
|
|
|
|
|
128 |
@st.experimental_memo(suppress_st_warning=True)
|
129 |
def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
|
130 |
|
@@ -142,8 +156,6 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
|
|
142 |
|
143 |
embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
|
144 |
|
145 |
-
|
146 |
-
|
147 |
docsearch = Pinecone.from_texts(
|
148 |
corpus,
|
149 |
embeddings,
|
@@ -177,61 +189,61 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
|
|
177 |
|
178 |
return hits
|
179 |
|
180 |
-
@st.experimental_memo(suppress_st_warning=True)
|
181 |
-
def embed_text(query,corpus,embedding_model):
|
182 |
|
183 |
-
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
|
213 |
-
|
214 |
-
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
|
221 |
-
|
222 |
|
223 |
-
|
224 |
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
|
234 |
-
|
235 |
|
236 |
@st.experimental_singleton(suppress_st_warning=True)
|
237 |
def get_spacy():
|
@@ -366,12 +378,6 @@ def get_all_entities_per_sentence(text):
|
|
366 |
for entity in sentence.ents:
|
367 |
entities_this_sentence.append(str(entity))
|
368 |
|
369 |
-
# FLAIR ENTITIES (CURRENTLY NOT USED)
|
370 |
-
# sentence_entities = Sentence(str(sentence))
|
371 |
-
# tagger.predict(sentence_entities)
|
372 |
-
# for entity in sentence_entities.get_spans('ner'):
|
373 |
-
# entities_this_sentence.append(entity.text)
|
374 |
-
|
375 |
# XLM ENTITIES
|
376 |
entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
|
377 |
for entity in entities_xlm:
|
@@ -802,5 +808,5 @@ def save_network_html(kb, filename="network.html"):
|
|
802 |
|
803 |
|
804 |
nlp = get_spacy()
|
805 |
-
sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer = load_models()
|
806 |
sbert = load_sbert('all-MiniLM-L12-v2')
|
|
|
94 |
"answer the question: {question}\n.\n"
|
95 |
)
|
96 |
|
97 |
+
|
98 |
+
|
99 |
@st.experimental_singleton(suppress_st_warning=True)
|
100 |
def load_models():
|
101 |
q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
|
|
|
118 |
|
119 |
return asr_model
|
120 |
|
121 |
+
# @st.experimental_singleton(suppress_st_warning=True)
|
122 |
+
# def load_sbert(model_name):
|
123 |
+
# if 'hkunlp' in model_name:
|
124 |
+
# sbert = INSTRUCTOR(model_name)
|
125 |
+
# else:
|
126 |
+
# sbert = SentenceTransformer(model_name)
|
127 |
+
|
128 |
+
# return sbert
|
129 |
+
|
130 |
+
@st.experimental_singleton(suppress_st_warning=True)
def process_corpus(corpus, tok, chunk_size=200, overlap=50):
    """Initialise the Pinecone client and split ``corpus`` into text chunks.

    Args:
        corpus: The full text to split.
        tok: Hugging Face tokenizer handed to
            ``CharacterTextSplitter.from_huggingface_tokenizer``.
        chunk_size: Passed through as the splitter's ``chunk_size``.
        overlap: Passed through as the splitter's ``chunk_overlap``.

    Returns:
        The value returned by ``text_splitter.split_text(corpus)``.

    Raises:
        RuntimeError: If the ``PINECONE_API_KEY`` environment variable is
            not set.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    # SECURITY: the API key used to be hard-coded here. Credentials must not
    # live in source control; the previously committed key should be treated
    # as compromised and rotated. Read it from the environment instead.
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set; configure it as an environment "
            "variable / app secret before calling process_corpus()."
        )

    # The environment name is not secret; keep the original value as default.
    environment = os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp")
    pinecone.init(api_key=api_key, environment=environment)

    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tok,
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separator='. ',
    )

    return text_splitter.split_text(corpus)
|
141 |
+
|
142 |
@st.experimental_memo(suppress_st_warning=True)
|
143 |
def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
|
144 |
|
|
|
156 |
|
157 |
embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
|
158 |
|
|
|
|
|
159 |
docsearch = Pinecone.from_texts(
|
160 |
corpus,
|
161 |
embeddings,
|
|
|
189 |
|
190 |
return hits
|
191 |
|
192 |
+
# @st.experimental_memo(suppress_st_warning=True)
|
193 |
+
# def embed_text(query,corpus,embedding_model):
|
194 |
|
195 |
+
# '''Embed text and generate semantic search scores'''
|
196 |
|
197 |
+
# #If model is e5 then apply prefixes to query and passage
|
198 |
+
# if embedding_model == 'intfloat/e5-base':
|
199 |
+
# search_input = 'query: '+ query
|
200 |
+
# passages_emb = ['passage: ' + sentence for sentence in corpus]
|
201 |
|
202 |
+
# elif embedding_model == 'hkunlp/instructor-base':
|
203 |
+
# search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
|
204 |
+
# passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
|
205 |
|
206 |
+
# else:
|
207 |
+
# search_input = query
|
208 |
+
# passages_emb = corpus
|
209 |
|
210 |
|
211 |
+
# #Embed corpus and question
|
212 |
+
# corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
|
213 |
+
# question_embedding = sbert.encode(search_input, convert_to_tensor=True)
|
214 |
+
# question_embedding = question_embedding.cpu()
|
215 |
+
# corpus_embedding = corpus_embedding.cpu()
|
216 |
|
217 |
+
# # #Calculate similarity scores and rank
|
218 |
+
# hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
|
219 |
+
# hits = hits[0] # Get the hits for the first query
|
220 |
|
221 |
+
# # ##### Re-Ranking #####
|
222 |
+
# # Now, score all retrieved passages with the cross_encoder
|
223 |
+
# cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
|
224 |
|
225 |
+
# if embedding_model == 'hkunlp/instructor-base':
|
226 |
+
# result = []
|
227 |
|
228 |
+
# for sublist in cross_inp:
|
229 |
+
# question = sublist[0][0][1]
|
230 |
+
# document = sublist[1][1]
|
231 |
+
# result.append([question, document])
|
232 |
|
233 |
+
# cross_inp = result
|
234 |
|
235 |
+
# cross_scores = cross_encoder.predict(cross_inp)
|
236 |
|
237 |
+
# # Sort results by the cross-encoder scores
|
238 |
+
# for idx in range(len(cross_scores)):
|
239 |
+
# hits[idx]['cross-score'] = cross_scores[idx]
|
240 |
|
241 |
+
# # Output of top-3 hits from re-ranker
|
242 |
+
# # st.markdown("\n-------------------------\n")
|
243 |
+
# # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
|
244 |
+
# hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
|
245 |
|
246 |
+
# return hits
|
247 |
|
248 |
@st.experimental_singleton(suppress_st_warning=True)
|
249 |
def get_spacy():
|
|
|
378 |
for entity in sentence.ents:
|
379 |
entities_this_sentence.append(str(entity))
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
# XLM ENTITIES
|
382 |
entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
|
383 |
for entity in entities_xlm:
|
|
|
808 |
|
809 |
|
810 |
nlp = get_spacy()
|
811 |
+
sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer = load_models()
|
812 |
sbert = load_sbert('all-MiniLM-L12-v2')
|