nickmuchi committed on
Commit
63f91c1
·
1 Parent(s): 96369f4

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +29 -16
functions.py CHANGED
@@ -94,10 +94,12 @@ initial_qa_template = (
94
  "answer the question: {question}\n.\n"
95
  )
96
 
97
-
98
 
99
  @st.experimental_singleton(suppress_st_warning=True)
100
  def load_models():
 
 
101
  q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
102
  ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
103
  kg_model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
@@ -128,7 +130,9 @@ def load_asr_model(asr_model_name):
128
  # return sbert
129
 
130
  @st.experimental_singleton(suppress_st_warning=True)
131
- def process_corpus(corpus, tok, chunk_size=200, overlap=50):
 
 
132
 
133
  pinecone.init(api_key="2d1e8029-2d84-4724-9f7c-a4f0f5ae908a", environment="us-west1-gcp")
134
 
@@ -137,10 +141,19 @@ def process_corpus(corpus, tok, chunk_size=200, overlap=50):
137
 
138
  texts = text_splitter.split_text(corpus)
139
 
140
- return texts
 
 
 
 
 
 
 
 
 
141
 
142
  @st.experimental_memo(suppress_st_warning=True)
143
- def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
144
 
145
  '''Embed text and generate semantic search scores'''
146
 
@@ -156,15 +169,9 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
156
 
157
  embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
158
 
159
- docsearch = Pinecone.from_texts(
160
- corpus,
161
- embeddings,
162
- index_name = index_id,
163
- namespace = f'{title}-earnings',
164
- metadatas = [
165
- {'source':i} for i in range(len(texts))]
166
- )
167
-
168
  docs = docsearch.similarity_search_with_score(query, k=3, namespace = f'{title}-earnings')
169
 
170
  docs = [d[0] for d in docs]
@@ -186,8 +193,14 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
186
 
187
  elif chain_type == 'refine':
188
 
189
-
190
- return hits
 
 
 
 
 
 
191
 
192
  # @st.experimental_memo(suppress_st_warning=True)
193
  # def embed_text(query,corpus,embedding_model):
@@ -304,7 +317,7 @@ def clean_text(text):
304
 
305
  @st.experimental_memo(suppress_st_warning=True)
306
  def chunk_long_text(text,threshold,window_size=3,stride=2):
307
- '''Preprocess text and chunk for semantic search and sentiment analysis'''
308
 
309
  #Convert cleaned text into sentences
310
  sentences = sent_tokenize(text)
 
94
  "answer the question: {question}\n.\n"
95
  )
96
 
97
+ ###################### Functions #######################################################################################
98
 
99
  @st.experimental_singleton(suppress_st_warning=True)
100
  def load_models():
101
+
102
+ '''Load and cache all the models to be used'''
103
  q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
104
  ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
105
  kg_model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
 
130
  # return sbert
131
 
132
  @st.experimental_singleton(suppress_st_warning=True)
133
+ def process_corpus(corpus, tok, title, embeddings, chunk_size=200, overlap=50):
134
+
135
+ '''Process text for Semantic Search'''
136
 
137
  pinecone.init(api_key="2d1e8029-2d84-4724-9f7c-a4f0f5ae908a", environment="us-west1-gcp")
138
 
 
141
 
142
  texts = text_splitter.split_text(corpus)
143
 
144
+ docsearch = Pinecone.from_texts(
145
+ texts,
146
+ embeddings,
147
+ index_name = index_id,
148
+ namespace = f'{title}-earnings',
149
+ metadatas = [
150
+ {'source':i} for i in range(len(texts))]
151
+ )
152
+
153
+ return docsearch
154
 
155
  @st.experimental_memo(suppress_st_warning=True)
156
+ def embed_text(query,corpus,title,embedding_model,emb_tok,chain_type='stuff'):
157
 
158
  '''Embed text and generate semantic search scores'''
159
 
 
169
 
170
  embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
171
 
172
+ title = title[0]
173
+ docsearch = process_corpus(corpus,embed_tok,title, embeddings)
174
+
 
 
 
 
 
 
175
  docs = docsearch.similarity_search_with_score(query, k=3, namespace = f'{title}-earnings')
176
 
177
  docs = [d[0] for d in docs]
 
193
 
194
  elif chain_type == 'refine':
195
 
196
+ initial_qa_prompt = PromptTemplate(
197
+ input_variables=["context_str", "question"], template=initial_qa_template
198
+ )
199
+ chain = load_qa_chain(OpenAI(temperature=0), chain_type="refine", return_refine_steps=False,
200
+ question_prompt=initial_qa_prompt, refine_prompt=refine_prompt)
201
+ answer = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
202
+
203
+ return answer['output_text']
204
 
205
  # @st.experimental_memo(suppress_st_warning=True)
206
  # def embed_text(query,corpus,embedding_model):
 
317
 
318
  @st.experimental_memo(suppress_st_warning=True)
319
  def chunk_long_text(text,threshold,window_size=3,stride=2):
320
+ '''Preprocess text and chunk for sentiment analysis'''
321
 
322
  #Convert cleaned text into sentences
323
  sentences = sent_tokenize(text)