nickmuchi committed on
Commit
74f896e
·
1 Parent(s): 08e6e30

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +62 -56
functions.py CHANGED
@@ -94,6 +94,8 @@ initial_qa_template = (
94
  "answer the question: {question}\n.\n"
95
  )
96
 
 
 
97
  @st.experimental_singleton(suppress_st_warning=True)
98
  def load_models():
99
  q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
@@ -116,15 +118,27 @@ def load_asr_model(asr_model_name):
116
 
117
  return asr_model
118
 
119
- @st.experimental_singleton(suppress_st_warning=True)
120
- def load_sbert(model_name):
121
- if 'hkunlp' in model_name:
122
- sbert = INSTRUCTOR(model_name)
123
- else:
124
- sbert = SentenceTransformer(model_name)
 
 
 
 
 
125
 
126
- return sbert
 
 
 
 
 
127
 
 
 
128
  @st.experimental_memo(suppress_st_warning=True)
129
  def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
130
 
@@ -142,8 +156,6 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
142
 
143
  embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
144
 
145
-
146
-
147
  docsearch = Pinecone.from_texts(
148
  corpus,
149
  embeddings,
@@ -177,61 +189,61 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
177
 
178
  return hits
179
 
180
- @st.experimental_memo(suppress_st_warning=True)
181
- def embed_text(query,corpus,embedding_model):
182
 
183
- '''Embed text and generate semantic search scores'''
184
 
185
- #If model is e5 then apply prefixes to query and passage
186
- if embedding_model == 'intfloat/e5-base':
187
- search_input = 'query: '+ query
188
- passages_emb = ['passage: ' + sentence for sentence in corpus]
189
 
190
- elif embedding_model == 'hkunlp/instructor-base':
191
- search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
192
- passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
193
 
194
- else:
195
- search_input = query
196
- passages_emb = corpus
197
 
198
 
199
- #Embed corpus and question
200
- corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
201
- question_embedding = sbert.encode(search_input, convert_to_tensor=True)
202
- question_embedding = question_embedding.cpu()
203
- corpus_embedding = corpus_embedding.cpu()
204
 
205
- # #Calculate similarity scores and rank
206
- hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
207
- hits = hits[0] # Get the hits for the first query
208
 
209
- # ##### Re-Ranking #####
210
- # Now, score all retrieved passages with the cross_encoder
211
- cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
212
 
213
- if embedding_model == 'hkunlp/instructor-base':
214
- result = []
215
 
216
- for sublist in cross_inp:
217
- question = sublist[0][0][1]
218
- document = sublist[1][1]
219
- result.append([question, document])
220
 
221
- cross_inp = result
222
 
223
- cross_scores = cross_encoder.predict(cross_inp)
224
 
225
- # Sort results by the cross-encoder scores
226
- for idx in range(len(cross_scores)):
227
- hits[idx]['cross-score'] = cross_scores[idx]
228
 
229
- # Output of top-3 hits from re-ranker
230
- # st.markdown("\n-------------------------\n")
231
- # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
232
- hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
233
 
234
- return hits
235
 
236
  @st.experimental_singleton(suppress_st_warning=True)
237
  def get_spacy():
@@ -366,12 +378,6 @@ def get_all_entities_per_sentence(text):
366
  for entity in sentence.ents:
367
  entities_this_sentence.append(str(entity))
368
 
369
- # FLAIR ENTITIES (CURRENTLY NOT USED)
370
- # sentence_entities = Sentence(str(sentence))
371
- # tagger.predict(sentence_entities)
372
- # for entity in sentence_entities.get_spans('ner'):
373
- # entities_this_sentence.append(entity.text)
374
-
375
  # XLM ENTITIES
376
  entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
377
  for entity in entities_xlm:
@@ -802,5 +808,5 @@ def save_network_html(kb, filename="network.html"):
802
 
803
 
804
  nlp = get_spacy()
805
- sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer = load_models()
806
  sbert = load_sbert('all-MiniLM-L12-v2')
 
94
  "answer the question: {question}\n.\n"
95
  )
96
 
97
+
98
+
99
  @st.experimental_singleton(suppress_st_warning=True)
100
  def load_models():
101
  q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
 
118
 
119
  return asr_model
120
 
121
+ # @st.experimental_singleton(suppress_st_warning=True)
122
+ # def load_sbert(model_name):
123
+ # if 'hkunlp' in model_name:
124
+ # sbert = INSTRUCTOR(model_name)
125
+ # else:
126
+ # sbert = SentenceTransformer(model_name)
127
+
128
+ # return sbert
129
+
130
+ @st.experimental_singleton(suppress_st_warning=True)
131
+ def process_corpus(corpus, tok, chunk_size=200, overlap=50):
132
 
133
+ pinecone.init(api_key="2d1e8029-2d84-4724-9f7c-a4f0f5ae908a", environment="us-west1-gcp")
134
+
135
+ tokenizer = tok
136
+ text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=overlap,separator='. ')
137
+
138
+ texts = text_splitter.split_text(corpus)
139
 
140
+ return texts
141
+
142
  @st.experimental_memo(suppress_st_warning=True)
143
  def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
144
 
 
156
 
157
  embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
158
 
 
 
159
  docsearch = Pinecone.from_texts(
160
  corpus,
161
  embeddings,
 
189
 
190
  return hits
191
 
192
+ # @st.experimental_memo(suppress_st_warning=True)
193
+ # def embed_text(query,corpus,embedding_model):
194
 
195
+ # '''Embed text and generate semantic search scores'''
196
 
197
+ # #If model is e5 then apply prefixes to query and passage
198
+ # if embedding_model == 'intfloat/e5-base':
199
+ # search_input = 'query: '+ query
200
+ # passages_emb = ['passage: ' + sentence for sentence in corpus]
201
 
202
+ # elif embedding_model == 'hkunlp/instructor-base':
203
+ # search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
204
+ # passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
205
 
206
+ # else:
207
+ # search_input = query
208
+ # passages_emb = corpus
209
 
210
 
211
+ # #Embed corpus and question
212
+ # corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
213
+ # question_embedding = sbert.encode(search_input, convert_to_tensor=True)
214
+ # question_embedding = question_embedding.cpu()
215
+ # corpus_embedding = corpus_embedding.cpu()
216
 
217
+ # # #Calculate similarity scores and rank
218
+ # hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
219
+ # hits = hits[0] # Get the hits for the first query
220
 
221
+ # # ##### Re-Ranking #####
222
+ # # Now, score all retrieved passages with the cross_encoder
223
+ # cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
224
 
225
+ # if embedding_model == 'hkunlp/instructor-base':
226
+ # result = []
227
 
228
+ # for sublist in cross_inp:
229
+ # question = sublist[0][0][1]
230
+ # document = sublist[1][1]
231
+ # result.append([question, document])
232
 
233
+ # cross_inp = result
234
 
235
+ # cross_scores = cross_encoder.predict(cross_inp)
236
 
237
+ # # Sort results by the cross-encoder scores
238
+ # for idx in range(len(cross_scores)):
239
+ # hits[idx]['cross-score'] = cross_scores[idx]
240
 
241
+ # # Output of top-3 hits from re-ranker
242
+ # # st.markdown("\n-------------------------\n")
243
+ # # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
244
+ # hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
245
 
246
+ # return hits
247
 
248
  @st.experimental_singleton(suppress_st_warning=True)
249
  def get_spacy():
 
378
  for entity in sentence.ents:
379
  entities_this_sentence.append(str(entity))
380
 
 
 
 
 
 
 
381
  # XLM ENTITIES
382
  entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
383
  for entity in entities_xlm:
 
808
 
809
 
810
  nlp = get_spacy()
811
+ sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer = load_models()
812
  sbert = load_sbert('all-MiniLM-L12-v2')