domenicrosati committed
Commit f5555cd · 1 Parent(s): 2d39184

experiment with summarization

Files changed (1):
  1. app.py  +26 -8
app.py CHANGED

@@ -149,9 +149,10 @@ def init_models():
     reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-2-v2', device=device)
     # queryexp_tokenizer = AutoTokenizer.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
     # queryexp_model = AutoModelWithLMHead.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
-    return question_answerer, reranker, stop, device # uqeryexp_model, queryexp_tokenizer
+    summarizer = pipeline("summarization")
+    return question_answerer, reranker, stop, device, summarizer
 
-qa_model, reranker, stop, device = init_models() # queryexp_model, queryexp_tokenizer
+qa_model, reranker, stop, device, summarizer = init_models() # queryexp_model, queryexp_tokenizer
 
 
 def clean_query(query, strict=True, clean=True):
@@ -212,6 +213,9 @@ st.markdown("""
 """, unsafe_allow_html=True)
 
 with st.expander("Settings (strictness, context limit, top hits)"):
+    use_mds = st.radio(
+        "Use multi-document summarization to summarize answer?",
+        ('yes', 'no'))
     support_all = st.radio(
         "Use abstracts and titles as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
         ('yes', 'no'))
@@ -267,6 +271,21 @@ def matched_context(start_i, end_i, contexts_string, seperator='---'):
     return None
 
 
+def gen_summary(query, sorted_result):
+    doc_sep = '\n'
+    summary = summarizer(f'{query} '.join([f'{doc_sep}'.join(r['texts']) + r['context'] for r in sorted_result]))[0]['summary_text']
+    st.markdown(f"""
+    <div class="container-fluid">
+        <div class="row align-items-start">
+            <div class="col-md-12 col-sm-12">
+                <strong>Answer:</strong> {summary}
+            </div>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    st.markdown("<br /><br /><h5>Sources:</h5>", unsafe_allow_html=True)
+
+
 def run_query(query):
     # if use_query_exp == 'yes':
     #     query_exp = paraphrase(f"question2question: {query}")
@@ -275,10 +294,6 @@ def run_query(query):
     #     * {query_exp}
     #     """)
 
-    # address period in highlitht avoidability. Risk factors
-    # address poor tokenization Deletions involving chromosome region 4p16.3 cause WolfHirschhorn syndrome (WHS, OMIM 194190) [Battaglia et al, 2001].
-    # address highlight html
-
     # could also try fallback if there are no good answers by score...
     limit = top_hits_limit or 100
     context_limit = context_lim or 10
@@ -346,10 +361,13 @@ def run_query(query):
     else:
         threshold = (confidence_threshold or 10) / 100
 
-    sorted_result = filter(
+    sorted_result = list(filter(
         lambda x: x['score'] > threshold,
         sorted_result
-    )
+    ))
+
+    if use_mds == 'yes':
+        gen_summary(query, sorted_result)
 
     for r in sorted_result:
         ctx = remove_html(r["context"])
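For readers who want to try the summarization path from this commit outside of Streamlit, a minimal standalone sketch follows. The hit shape (a list of sentence strings under 'texts' plus a longer 'context' string) is inferred from how gen_summary() consumes sorted_result; the helper name summarize_hits and the sample hits are hypothetical, and pipeline("summarization") simply loads whatever default model transformers selects.

# Minimal sketch (not part of the commit) of the multi-document summarization step.
# Assumption: each hit mirrors app.py's sorted_result entries -- a list of sentence
# strings under 'texts' and a longer 'context' string.
from transformers import pipeline

summarizer = pipeline("summarization")  # default summarization model chosen by transformers


def summarize_hits(query, sorted_result, doc_sep="\n"):
    # Concatenate the evidence the same way gen_summary() does: join each hit's
    # sentences with doc_sep, append its context, then join hits with the query.
    joined = f"{query} ".join(
        doc_sep.join(hit["texts"]) + hit["context"] for hit in sorted_result
    )
    return summarizer(joined)[0]["summary_text"]


if __name__ == "__main__":
    # Hypothetical hits. Note that in app.py sorted_result is materialized with
    # list(filter(...)) so both gen_summary() and the per-result display loop can
    # iterate over it; a bare filter() iterator would be exhausted after one pass.
    hits = [
        {"texts": ["Deletions in chromosome region 4p16.3 cause Wolf-Hirschhorn syndrome."],
         "context": "Review of Wolf-Hirschhorn syndrome genetics."},
        {"texts": ["Phenotype severity correlates with deletion size."],
         "context": "Genotype-phenotype correlation study."},
    ]
    print(summarize_hits("What causes Wolf-Hirschhorn syndrome?", hits))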