Spaces:
Sleeping
Sleeping
zeyadahmedd
committed on
Commit
·
0e1d9bb
1
Parent(s):
b884c59
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
import time
|
2 |
-
|
3 |
import chromadb
|
4 |
from chromadb.utils import embedding_functions
|
5 |
from test.new import connect_to_llama
|
@@ -10,7 +8,6 @@ import os
|
|
10 |
from chunkipy.text_chunker import split_by_sentences
|
11 |
import langid
|
12 |
from translate import Translator
|
13 |
-
|
14 |
chroma_client = chromadb.PersistentClient()
|
15 |
from test.llama import llama_local
|
16 |
working_dir = os.getcwd()
|
@@ -32,22 +29,25 @@ def detect_and_translate_query(query, context, dest_language='en'):
|
|
32 |
translated_context = translator.translate(context)
|
33 |
return translated_query, translated_context, input_language
|
34 |
|
|
|
35 |
def translate_response(response, source_language, dest_language):
|
36 |
translator = Translator(to_lang=source_language, from_lang=dest_language)
|
37 |
translated_response = translator.translate(response)
|
38 |
-
print("translate_response "+str(translate_response))
|
39 |
return translated_response
|
40 |
-
|
|
|
|
|
41 |
filelist = os.listdir(path)
|
42 |
print(filelist)
|
43 |
data_pdfs = []
|
44 |
-
metadata_buff=[]
|
45 |
for file_n in filelist:
|
46 |
with open(file_n, 'rb') as file:
|
47 |
pdf_reader = PyPDF2.PdfReader(file)
|
48 |
-
meta_data=dict(pdf_reader.metadata)
|
49 |
-
print("De elmeta data before: ",meta_data)
|
50 |
-
meta_data.update({"/Title":file_n})
|
51 |
print("De elmeta data after: ", meta_data)
|
52 |
metadata_buff.append(meta_data)
|
53 |
data = ""
|
@@ -59,22 +59,23 @@ def create_multiple_db(path,collection,working_dir):
|
|
59 |
data_pdfs.append(chunk)
|
60 |
file.close()
|
61 |
os.chdir(working_dir)
|
62 |
-
print(metadata_buff,"\n",len(metadata_buff))
|
63 |
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
|
64 |
i = 0
|
65 |
-
md_i=0
|
66 |
for data in data_pdfs:
|
67 |
print(data)
|
68 |
collection.add(
|
69 |
documents=data,
|
70 |
embeddings=sentence_transformer_ef(data),
|
71 |
ids=['id' + str(x + i) for x in range(len(data))],
|
72 |
-
metadatas=[metadata_buff[md_i]for i in range(len(data))]
|
73 |
)
|
74 |
-
md_i+=1
|
75 |
i += len(data)
|
76 |
return "done"
|
77 |
-
|
|
|
78 |
def architecture_with_chroma(data):
|
79 |
try:
|
80 |
data_dict = eval(data)
|
@@ -87,20 +88,20 @@ def architecture_with_chroma(data):
|
|
87 |
query = data_dict.get('query')
|
88 |
if query is None or query == "":
|
89 |
return "please enter a query to process"
|
90 |
-
if(not os.path.exists(id)):
|
91 |
return "sorry ,there is no directory for this client"
|
92 |
collection = chroma_client.get_or_create_collection(name=id)
|
93 |
results = collection.query(
|
94 |
query_texts=[query],
|
95 |
-
n_results=
|
96 |
)
|
97 |
-
print(results," de elresults\n")
|
98 |
context = results.get('documents')[0]
|
99 |
results_metadata = list(results.get("metadatas")[0])
|
100 |
results_documents = list(results.get("documents")[0])
|
101 |
-
print(len(results_documents),"da el len bta3 elcontexts\n")
|
102 |
print(results_documents)
|
103 |
-
for i in range(
|
104 |
results_documents[i] = f"In {results_metadata[i].get('/Title')}:" + results_documents[i]
|
105 |
for data in results_documents:
|
106 |
print(data)
|
@@ -108,54 +109,54 @@ def architecture_with_chroma(data):
|
|
108 |
# generated_text = model(input_prompt.format(query+"? answer reasoning answers from the provided contexts only that is related and contains this information ", context), max_length=1024, do_sample=False)[0]['generated_text']
|
109 |
# print(input_prompt)
|
110 |
chroma_client.stop()
|
111 |
-
translated_query, translated_context, input_language = detect_and_translate_query(query, context)
|
112 |
-
print('translated_query '+str(translated_query))
|
113 |
-
print('translated_context '+str(translated_context))
|
114 |
-
results=connect_to_llama(query,results_documents)
|
115 |
# results=llama_local(query,results_documents)
|
116 |
-
translated_response = translate_response(results, input_language, dest_language='en')
|
117 |
-
return translated_response
|
118 |
-
|
119 |
# return generated_text
|
|
|
|
|
120 |
def create(data):
|
121 |
print(data)
|
122 |
print(type(data))
|
123 |
try:
|
124 |
-
dict=eval(data)
|
125 |
except:
|
126 |
return "please enter a valid json (dict) to process"
|
127 |
-
id=dict.get('id')
|
128 |
-
if id==None
|
129 |
return "please enter an id to process on the prompt"
|
130 |
-
id="mate"+str(id)
|
131 |
-
if(not os.path.exists(id)):
|
132 |
return "sorry ,there is no directory for this client"
|
133 |
else:
|
134 |
collection = chroma_client.get_or_create_collection(name=id)
|
135 |
print(os.chdir(id))
|
136 |
-
return create_multiple_db(os.getcwd(),collection,working_dir)+" making data for client"
|
|
|
137 |
|
138 |
def update(data):
|
139 |
print(data)
|
140 |
print(type(data))
|
141 |
try:
|
142 |
-
dict=eval(data)
|
143 |
except:
|
144 |
return "please enter a valid json (dict) to process"
|
145 |
-
id=dict.get('id')
|
146 |
-
if id==None
|
147 |
return "please enter an id to process on the prompt"
|
148 |
-
id="mate"+str(dict.get('id'))
|
149 |
-
if(not os.path.exists(id)):
|
150 |
return "sorry ,there is no directory for this client"
|
151 |
else:
|
152 |
-
|
153 |
-
chroma_client.delete_collection(name=id)
|
154 |
-
except error:
|
155 |
-
pass
|
156 |
-
collection=chroma_client.create_collection(name=id)
|
157 |
print(os.chdir(id))
|
158 |
-
return create_multiple_db(os.getcwd(),collection,working_dir)+"updating client embeddings"
|
|
|
159 |
|
160 |
iface = gr.Blocks()
|
161 |
with iface:
|
|
|
|
|
|
|
1 |
import chromadb
|
2 |
from chromadb.utils import embedding_functions
|
3 |
from test.new import connect_to_llama
|
|
|
8 |
from chunkipy.text_chunker import split_by_sentences
|
9 |
import langid
|
10 |
from translate import Translator
|
|
|
11 |
chroma_client = chromadb.PersistentClient()
|
12 |
from test.llama import llama_local
|
13 |
working_dir = os.getcwd()
|
|
|
29 |
translated_context = translator.translate(context)
|
30 |
return translated_query, translated_context, input_language
|
31 |
|
32 |
+
|
33 |
def translate_response(response, source_language, dest_language):
    """Translate ``response`` from ``dest_language`` into ``source_language``.

    The argument names describe the overall pipeline rather than this call:
    ``source_language`` is the language the text is translated *into* and
    ``dest_language`` is the language ``response`` is currently written in.

    Args:
        response: Text to translate.
        source_language: Language code passed to ``Translator`` as ``to_lang``.
        dest_language: Language code passed to ``Translator`` as ``from_lang``.

    Returns:
        Whatever ``Translator.translate`` returns for ``response``.
    """
    translator = Translator(to_lang=source_language, from_lang=dest_language)
    translated_response = translator.translate(response)
    # Log the translated text itself; the previous code stringified the
    # function object ``translate_response`` by mistake.
    print("translate_response " + str(translated_response))
    return translated_response
|
38 |
+
|
39 |
+
|
40 |
+
def create_multiple_db(path, collection, working_dir):
|
41 |
filelist = os.listdir(path)
|
42 |
print(filelist)
|
43 |
data_pdfs = []
|
44 |
+
metadata_buff = []
|
45 |
for file_n in filelist:
|
46 |
with open(file_n, 'rb') as file:
|
47 |
pdf_reader = PyPDF2.PdfReader(file)
|
48 |
+
meta_data = dict(pdf_reader.metadata)
|
49 |
+
print("De elmeta data before: ", meta_data)
|
50 |
+
meta_data.update({"/Title": file_n})
|
51 |
print("De elmeta data after: ", meta_data)
|
52 |
metadata_buff.append(meta_data)
|
53 |
data = ""
|
|
|
59 |
data_pdfs.append(chunk)
|
60 |
file.close()
|
61 |
os.chdir(working_dir)
|
62 |
+
print(metadata_buff, "\n", len(metadata_buff))
|
63 |
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
|
64 |
i = 0
|
65 |
+
md_i = 0
|
66 |
for data in data_pdfs:
|
67 |
print(data)
|
68 |
collection.add(
|
69 |
documents=data,
|
70 |
embeddings=sentence_transformer_ef(data),
|
71 |
ids=['id' + str(x + i) for x in range(len(data))],
|
72 |
+
metadatas=[metadata_buff[md_i] for i in range(len(data))]
|
73 |
)
|
74 |
+
md_i += 1
|
75 |
i += len(data)
|
76 |
return "done"
|
77 |
+
|
78 |
+
|
79 |
def architecture_with_chroma(data):
|
80 |
try:
|
81 |
data_dict = eval(data)
|
|
|
88 |
query = data_dict.get('query')
|
89 |
if query is None or query == "":
|
90 |
return "please enter a query to process"
|
91 |
+
if (not os.path.exists(id)):
|
92 |
return "sorry ,there is no directory for this client"
|
93 |
collection = chroma_client.get_or_create_collection(name=id)
|
94 |
results = collection.query(
|
95 |
query_texts=[query],
|
96 |
+
n_results=10
|
97 |
)
|
98 |
+
print(results, " de elresults\n")
|
99 |
context = results.get('documents')[0]
|
100 |
results_metadata = list(results.get("metadatas")[0])
|
101 |
results_documents = list(results.get("documents")[0])
|
102 |
+
print(len(results_documents), "da el len bta3 elcontexts\n")
|
103 |
print(results_documents)
|
104 |
+
for i in range(10):
|
105 |
results_documents[i] = f"In {results_metadata[i].get('/Title')}:" + results_documents[i]
|
106 |
for data in results_documents:
|
107 |
print(data)
|
|
|
109 |
# generated_text = model(input_prompt.format(query+"? answer reasoning answers from the provided contexts only that is related and contains this information ", context), max_length=1024, do_sample=False)[0]['generated_text']
|
110 |
# print(input_prompt)
|
111 |
chroma_client.stop()
|
112 |
+
# translated_query, translated_context, input_language = detect_and_translate_query(query, context)
|
113 |
+
# print('translated_query ' + str(translated_query))
|
114 |
+
# print('translated_context ' + str(translated_context))
|
115 |
+
results = connect_to_llama(query, results_documents)
|
116 |
# results=llama_local(query,results_documents)
|
117 |
+
# translated_response = translate_response(results, input_language, dest_language='en')
|
118 |
+
# return translated_response
|
119 |
+
return results
|
120 |
# return generated_text
|
121 |
+
|
122 |
+
|
123 |
def create(data):
    """Build the Chroma collection for one client from its document folder.

    ``data`` is the textual form of a dict that must contain an ``'id'`` key.
    The collection name, which is also the name of the client's directory
    (looked up relative to the current working directory), is ``"mate"``
    followed by that id.

    Args:
        data: String holding a dict literal, e.g. ``"{'id': 7}"``.

    Returns:
        A human-readable status string: either one of the validation
        messages below, or the result of ``create_multiple_db`` followed by
        ``" making data for client"``.
    """
    print(data)
    print(type(data))
    try:
        # NOTE(security): eval() executes arbitrary expressions. If ``data``
        # can come from an untrusted user, switch to ast.literal_eval or
        # json.loads once callers are confirmed to send plain literals.
        payload = eval(data)
    except Exception:
        return "please enter a valid json (dict) to process"
    # eval() can succeed and still yield a non-dict (e.g. "5"); without this
    # guard the .get() below would raise AttributeError.
    if not isinstance(payload, dict):
        return "please enter a valid json (dict) to process"

    client_id = payload.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"

    collection_name = "mate" + str(client_id)
    if not os.path.exists(collection_name):
        return "sorry ,there is no directory for this client"

    collection = chroma_client.get_or_create_collection(name=collection_name)
    # create_multiple_db is handed the client directory as the process cwd.
    os.chdir(collection_name)
    try:
        status = create_multiple_db(os.getcwd(), collection, working_dir)
    finally:
        # Always restore the cwd so a failure while indexing does not break
        # the relative directory lookups of later requests.
        os.chdir(working_dir)
    return status + " making data for client"
|
140 |
+
|
141 |
|
142 |
def update(data):
    """Rebuild a client's Chroma collection from scratch.

    ``data`` is the textual form of a dict that must contain an ``'id'`` key.
    The collection name, which is also the name of the client's directory
    (looked up relative to the current working directory), is ``"mate"``
    followed by that id. Any existing collection with that name is dropped
    and a fresh one is populated via ``create_multiple_db``.

    Args:
        data: String holding a dict literal, e.g. ``"{'id': 7}"``.

    Returns:
        A human-readable status string: either one of the validation
        messages below, or the result of ``create_multiple_db`` followed by
        ``" updating client embeddings"``.
    """
    print(data)
    print(type(data))
    try:
        # NOTE(security): eval() executes arbitrary expressions. If ``data``
        # can come from an untrusted user, switch to ast.literal_eval or
        # json.loads once callers are confirmed to send plain literals.
        payload = eval(data)
    except Exception:
        return "please enter a valid json (dict) to process"
    # eval() can succeed and still yield a non-dict (e.g. "5"); without this
    # guard the .get() below would raise AttributeError.
    if not isinstance(payload, dict):
        return "please enter a valid json (dict) to process"

    client_id = payload.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"

    collection_name = "mate" + str(client_id)
    if not os.path.exists(collection_name):
        return "sorry ,there is no directory for this client"

    # create_collection() refuses a name that already exists, which is the
    # normal situation for an update. Make sure the collection exists, drop
    # it, then recreate it empty; this works whether or not it was there and
    # does not depend on which exception type the client raises.
    chroma_client.get_or_create_collection(name=collection_name)
    chroma_client.delete_collection(name=collection_name)
    collection = chroma_client.create_collection(name=collection_name)

    # create_multiple_db is handed the client directory as the process cwd.
    os.chdir(collection_name)
    try:
        status = create_multiple_db(os.getcwd(), collection, working_dir)
    finally:
        # Always restore the cwd so a failure while indexing does not break
        # the relative directory lookups of later requests.
        os.chdir(working_dir)
    # Leading space added: the message used to run into the status word.
    return status + " updating client embeddings"
|
159 |
+
|
160 |
|
161 |
iface = gr.Blocks()
|
162 |
with iface:
|