daneshjoy committed on
Commit
06651f1
·
1 Parent(s): 7c3d9d1

using prepared doc_store

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +5 -79
  3. doc_store.zip +3 -0
  4. requirements.txt +1 -1
.gitignore CHANGED
@@ -1 +1,2 @@
1
  __pycache__/
 
 
1
  __pycache__/
2
+ data/
app.py CHANGED
@@ -1,21 +1,14 @@
 
 
1
  import os
2
 
3
  import streamlit as st
4
- from haystack.document_stores import FAISSDocumentStore
5
- from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text
6
- from haystack.nodes import DensePassageRetriever
7
- from haystack.utils import print_documents, print_answers
8
- from haystack.pipelines import DocumentSearchPipeline
9
- from haystack.nodes import Seq2SeqGenerator
10
- from haystack.pipelines import GenerativeQAPipeline
11
- from haystack.utils import convert_files_to_docs, clean_wiki_text
12
 
13
  from lfqa import prepare, answer
14
 
15
 
16
- doc_dir = "./data/wiki_gameofthrones_txt12"
17
- sql_file = './faiss_doc_store.db'
18
- faiss_file = './faiss_index.faiss'
19
 
20
  # Sliders
21
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
@@ -130,72 +123,5 @@ def main(pipe):
130
  st.write(st.session_state.results['answers'][0].meta['content'][i])
131
  st.markdown('---\n')
132
 
133
- # %% ------------------------------------------- Creating Doc store
134
- # if not os.path.exists(sql_file) or not os.path.exists(faiss_file):
135
-
136
- module_dir = os.path.dirname(os.path.abspath(__file__))
137
- os.chdir(module_dir)
138
-
139
-
140
-
141
- # %% Download/Load Docs
142
-
143
- # Get some files that we want to use
144
- # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
145
- # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
146
-
147
- print('---> Loading Documents ...')
148
-
149
- # Convert files to docs + cleaning
150
- docs = convert_files_to_docs(dir_path=doc_dir,
151
- clean_func=clean_wiki_text,
152
- split_paragraphs=True)
153
-
154
- # %% Document Store
155
-
156
- print('---> Creating document store ...')
157
- # # custom path for sql file
158
- # document_store = FAISSDocumentStore(embedding_dim=128,
159
- # faiss_index_factory_str="Flat",
160
- # sql_url=f"sqlite:///{sql_file}")
161
-
162
- # In memory database
163
- document_store = FAISSDocumentStore(embedding_dim=128,
164
- faiss_index_factory_str="Flat",
165
- sql_url=f"sqlite://")
166
-
167
- # # default path for sql file
168
- # document_store = FAISSDocumentStore(embedding_dim=128,
169
- # faiss_index_factory_str="Flat")
170
-
171
-
172
-
173
- # %% Retriever (DPR)
174
-
175
- print('---> Initializing retriever ...')
176
- retriever = DensePassageRetriever(
177
- document_store=document_store,
178
- query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
179
- passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
180
- use_gpu=False
181
- )
182
-
183
- # %% Create Embeddings and save results
184
- document_store.update_embeddings(retriever)
185
-
186
- print('---> Saving results ...')
187
- # update db
188
- document_store.write_documents(docs)
189
- # save faiss file
190
- document_store.save(faiss_file)
191
-
192
- print('Done!')
193
-
194
-
195
- # %% ------------------------------------------- Main App
196
-
197
- generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa", use_gpu=False)
198
-
199
- pipe = GenerativeQAPipeline(generator, retriever)
200
- # pipe = prepare()
201
  main(pipe)
 
1
+ from zipfile import ZipFile
2
+
3
  import os
4
 
5
  import streamlit as st
 
 
 
 
 
 
 
 
6
 
7
  from lfqa import prepare, answer
8
 
9
 
10
+ with ZipFile("doc_store.zip","r") as zip_ref:
11
+ zip_ref.extractall('.')
 
12
 
13
  # Sliders
14
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
 
123
  st.write(st.session_state.results['answers'][0].meta['content'][i])
124
  st.markdown('---\n')
125
 
126
+ pipe = prepare()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  main(pipe)
doc_store.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c25c0f4d55c7d80aa4525d619a11531d9a5c316d5022cb8927bdd19c635747
3
+ size 2589071
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- farm-haystack[docstores,preprocessing,ocr,faiss]
2
  streamlit >= 1.9.0, < 2
3
  st-annotated-text >= 2.0.0, < 3
 
1
+ farm-haystack[ocr,faiss]
2
  streamlit >= 1.9.0, < 2
3
  st-annotated-text >= 2.0.0, < 3