awacke1 committed on
Commit
7e31ed4
·
1 Parent(s): d54014f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +214 -11
app.py CHANGED
@@ -1,15 +1,218 @@
1
- from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
2
- import torch
 
 
 
 
 
 
3
  import gradio as gr
4
 
5
- import os
6
- import csv
7
- from gradio import inputs, outputs
8
- from datetime import datetime
9
- import fastapi
10
- from typing import List, Dict
11
- import httpx
12
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  UseMemory=True
@@ -135,4 +338,4 @@ with gr.Blocks() as demo:
135
 
136
  b1.click(fn=chat, inputs=[t1, s1], outputs=[s1, df1, file])
137
 
138
- demo.launch(debug=True, show_error=True)
 
1
+ import spacy
2
+ import wikipediaapi
3
+ import wikipedia
4
+ from wikipedia.exceptions import DisambiguationError
5
+ from transformers import TFAutoModel, AutoTokenizer
6
+ import numpy as np
7
+ import pandas as pd
8
+ import faiss
9
  import gradio as gr
10
 
11
# Load the small English spaCy pipeline, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (and Ctrl-C) visible
    # instead of triggering a pointless download.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
16
+
17
wh_words = ['what', 'who', 'how', 'when', 'which']


def get_concepts(text):
    """Return the noun chunks found in the lower-cased *text*.

    Chunks whose text is exactly one of ``wh_words`` are left out.
    """
    parsed = nlp(text.lower())
    return [
        noun_chunk.text
        for noun_chunk in parsed.noun_chunks
        if noun_chunk.text not in wh_words
    ]
26
+
27
def get_passages(text, k=100):
    """Split *text* into passages of consecutive sentences.

    Sentences are accumulated until adding the next one would bring the
    running length to ``k`` or more; at that point the accumulated passage
    is emitted and the next sentence starts a new one. Length is measured
    with ``len()`` on each spaCy sentence span (its token count).

    Args:
        text: Raw text to segment.
        k: Length threshold at which a passage is closed.

    Returns:
        A list of passage strings, sentences joined by single spaces.
        Every sentence appears in exactly one passage, and no passage is
        empty.
    """
    doc = nlp(text)
    passages = []
    current = []       # sentence texts of the passage being built
    current_len = 0    # summed length of the sentences in `current`
    for sent in doc.sents:
        sent_len = len(sent)
        # Only flush when something has been collected; a single over-long
        # sentence must not produce an empty passage.
        if current and current_len + sent_len >= k:
            passages.append(" ".join(current))
            current = []
            current_len = 0
        current.append(sent.text)
        current_len += sent_len
    # Flush the tail so the final sentence is never dropped, even when it
    # was the one that crossed the threshold.
    if current:
        passages.append(" ".join(current))
    return passages
51
+
52
def get_dicts_for_dpr(concepts, n_results=20, k=100):
    """Collect Wikipedia passages for each concept.

    For every concept, the top ``n_results`` Wikipedia search hits are
    fetched and their content is split with ``get_passages``.

    Args:
        concepts: Iterable of search strings.
        n_results: Number of search hits to request per concept.
        k: Passage length threshold forwarded to ``get_passages``.

    Returns:
        A list of ``{'text': passage, 'title': page_title}`` dicts.
    """
    dicts = []
    for concept in concepts:
        wikis = wikipedia.search(concept, results=n_results)
        print(concept, "No of Wikis: ",len(wikis))
        for wiki in wikis:
            try:
                html_page = wikipedia.page(title=wiki, auto_suggest=False)
            except (DisambiguationError, wikipedia.exceptions.PageError):
                # Ambiguous or missing titles are skipped so one bad hit
                # does not abort the whole search.
                continue

            dicts.extend(
                {'text': passage, 'title': wiki}
                for passage in get_passages(html_page.content, k=k)
            )
    return dicts
72
+
73
# Hugging Face checkpoint ids for the passage (context) and question encoders.
_CTX_CHECKPOINT = "nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2"
_QUESTION_CHECKPOINT = "nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2"

# Models first, then their tokenizers (same load order as before).
passage_encoder = TFAutoModel.from_pretrained(_CTX_CHECKPOINT)
query_encoder = TFAutoModel.from_pretrained(_QUESTION_CHECKPOINT)
p_tokenizer = AutoTokenizer.from_pretrained(_CTX_CHECKPOINT)
q_tokenizer = AutoTokenizer.from_pretrained(_QUESTION_CHECKPOINT)
77
+
78
def get_title_text_combined(passage_dicts):
    """Return a ``(title, text)`` tuple for each passage dict, in order.

    Args:
        passage_dicts: Iterable of dicts with ``'title'`` and ``'text'`` keys.

    Returns:
        A list of 2-tuples ``(title, text)``.
    """
    return [(p['title'], p['text']) for p in passage_dicts]
83
+
84
def extracted_passage_embeddings(processed_passages, max_length=156):
    """Tokenize the passages and run them through ``passage_encoder``.

    Inputs are padded/truncated to ``max_length`` tokens; the encoder's
    ``predict`` output is returned unchanged.
    """
    encoded = p_tokenizer.batch_encode_plus(
        processed_passages,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True,
    )
    # The encoder takes the three arrays positionally in this order.
    model_inputs = [
        np.array(encoded['input_ids']),
        np.array(encoded['attention_mask']),
        np.array(encoded['token_type_ids']),
    ]
    return passage_encoder.predict(model_inputs, batch_size=64, verbose=1)
99
+
100
def extracted_query_embeddings(queries, max_length=64):
    """Tokenize the queries and run them through ``query_encoder``.

    Inputs are padded/truncated to ``max_length`` tokens; the encoder's
    ``predict`` output is returned unchanged.
    """
    encoded = q_tokenizer.batch_encode_plus(
        queries,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True,
    )
    # The encoder takes the three arrays positionally in this order.
    model_inputs = [
        np.array(encoded['input_ids']),
        np.array(encoded['attention_mask']),
        np.array(encoded['token_type_ids']),
    ]
    return query_encoder.predict(model_inputs, batch_size=1, verbose=1)
115
+
116
+ #Wikipedia API:
117
+
118
def get_pagetext(page):
    """Return ``str(page)`` with tab characters removed.

    The previous pattern was the two-character string "/t", which never
    matched a tab and instead deleted "/t" from ordinary text such as URLs.
    """
    return str(page).replace("\t", "")
122
+
123
def get_wiki_summary(search):
    """Look up a Wikipedia page and return its key fields as a DataFrame.

    Args:
        search: Page title to look up.

    Returns:
        A two-column DataFrame (``Entity``, ``Value``) with rows for URL,
        Title, Summary (first 60 characters), Text, Backlinks, Links and
        Categories. When the page does not exist, the same frame is
        returned with every value set to "Not found", so callers that
        bind the result to a single Dataframe output always get a frame.
    """
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page = wiki_wiki.page(search)

    entities = ["URL", "Title", "Summary", "Text", "Backlinks", "Links", "Categories"]
    if not page.exists():
        return pd.DataFrame(
            {'Entity': entities, 'Value': ["Not found"] * len(entities)}
        )

    def _joined(titles):
        # Same rendering as before: every title followed by " , ".
        return "".join(title + " , " for title in titles)

    values = [
        page.fullurl,
        page.title,
        page.summary[0:60],
        get_pagetext(page.text),
        _joined(page.backlinks),
        _joined(page.links),
        _joined(page.categories),
    ]
    return pd.DataFrame({'Entity': entities, 'Value': values})
172
+
173
+
174
def search(question):
    """Retrieve Wikipedia passages for *question*, ranked by embedding distance.

    Concepts are extracted from the question, one Wikipedia page per
    concept is split into passages, and passages and the question are
    embedded and ranked with an exact L2 FAISS index.

    Args:
        question: Free-text question.

    Returns:
        A DataFrame of passage dicts (``text``, ``title``) ordered from
        nearest to farthest, or an empty DataFrame when no passages were
        found.
    """
    concepts = get_concepts(question)
    print("concepts: ",concepts)
    dicts = get_dicts_for_dpr(concepts, n_results=1)
    lendicts = len(dicts)
    print("dicts len: ", lendicts)
    if lendicts == 0:
        return pd.DataFrame()

    processed_passages = get_title_text_combined(dicts)
    passage_vectors = extracted_passage_embeddings(processed_passages).pooler_output
    query_vectors = extracted_query_embeddings([question]).pooler_output

    # Size the index from the embeddings rather than a hard-coded width so
    # swapping the encoder checkpoint does not break the search.
    faiss_index = faiss.IndexFlatL2(passage_vectors.shape[1])
    faiss_index.add(passage_vectors)
    # The first return value holds L2 distances and is not needed here;
    # k equals the index size, so every passage is ranked.
    _, ranked = faiss_index.search(query_vectors, k=lendicts)
    return pd.DataFrame([dicts[i] for i in ranked[0]])
190
+
191
+
192
+ # AI UI SOTA - gradio blocks with UI formatting, and event driven UI
193
with gr.Blocks() as demo: # Block documentation on event listeners, start here: https://gradio.app/blocks_and_event_listeners/
    gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
    # The wrapping <div> is now closed so the centering does not leak into
    # the components rendered below it.
    gr.Markdown("""<div align="center">Search and Find Anything Then Use in AI! <a href="https://www.mediawiki.org/wiki/API:Main_page">MediaWiki - API for Wikipedia</a>. <a href="https://paperswithcode.com/datasets?q=wikipedia&v=lst&o=newest">Papers,Code,Datasets for SOTA w/ Wikipedia</a></div>""")
    with gr.Row(): # inputs and buttons
        # `value` is the initial-content parameter of Blocks components;
        # `default` is not applied as the starting text.
        inp = gr.Textbox(lines=1, value="Syd Mead", label="Question")
    with gr.Row(): # inputs and buttons
        b3 = gr.Button("Search AI Summaries")
        b4 = gr.Button("Search Web Live")
    with gr.Row(): # outputs DF1
        out = gr.Dataframe(label="Answers", type="pandas")
    with gr.Row(): # output DF2
        out_DF = gr.Dataframe(wrap=True, max_rows=1000, overflow_row_behaviour= "paginate", datatype = ["markdown", "markdown"], headers=['Entity', 'Value'])
    # Pressing Enter in the textbox and the "Search Web Live" button both
    # run the live Wikipedia lookup; "Search AI Summaries" runs retrieval.
    inp.submit(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
    b3.click(fn=search, inputs=inp, outputs=out)
    b4.click(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
demo.launch(debug=True, show_error=True)
211
+
212
+
213
+
214
+
215
+
216
 
217
 
218
  UseMemory=True
 
338
 
339
  b1.click(fn=chat, inputs=[t1, s1], outputs=[s1, df1, file])
340
 
341
+ demo.launch(debug=True, show_error=True)