# -*- coding: utf-8 -*-
"""
Streamlit/Hydralit front end for the Biomedical Epidemiology NER model.

The app exposes two pages: a home page describing the project and an
"Entity Recognizer" page that runs named entity recognition on pasted text
or on an uploaded PDF report (which is also annotated and displayed).

Created on Mon Jul  4 08:43:02 2022

@author: dreji18
"""

import base64
import datetime
import re
import time

import fitz
import hydralit_components as hc
import pandas as pd
import streamlit as st

from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
from functionforDownloadButtons import download_button

# Maximum number of words of pasted text that is sent to the model.
MAX_WORDS = 500
# Value passed as ``compute`` to ``ner_prediction``; kept in one place so a
# CPU-only deployment has a single line to change.
COMPUTE_DEVICE = 'gpu'
# Entities must score strictly above this threshold to be reported.
SCORE_THRESHOLD = 0.5
# Entities shorter than this many characters are dropped to avoid
# false positives.
MIN_ENTITY_LENGTH = 3

LOGO_IMAGE = "medical care logo template social media.png"
SUBMIT_LABEL = "🍃 Get me the data!"
DOWNLOAD_LABEL = "📥 Download (.csv)"
COLUMN_WIDTHS = [0.07, 1, 0.07, 4, 1.5]

# set page size wide and theme
st.set_page_config(layout='wide', initial_sidebar_state='collapsed',)

over_theme = {
    'txc_inactive': '#FFFFFF',
    'menu_background': '#696969',
    'txc_active': 'black',
}

# app page setup
# Imported after set_page_config, matching the original statement order.
# NOTE(review): confirm whether this ordering is actually required.
import hydralit as hy

app = hy.HydraApp(
    title='Biomedical Epidemiology NER App',
    nav_container=None,
    # The original passed the ``bool`` type itself here (always truthy);
    # an actual boolean is what was meant.
    nav_horizontal=True,
    layout='wide',
    # favicon="🧊",
    use_navbar=True,
    navbar_theme=over_theme,
    navbar_sticky=True,
    navbar_mode='pinned',
    use_loader=True,
    use_cookie_cache=True,
    sidebar_state='auto',
    navbar_animation=True,
    allow_url_nav=False,
    hide_streamlit_markers=True,
    # use_banner_images=["./background.png", None, {'header': "Biomedical Epidemiology Entity Recognizer"}, None, "./background.png"],
    # banner_spacing=[5, 30, 60, 30, 5],
    clear_cross_app_sessions=True,
    session_params=None,
)


def _truncate_to_word_limit(text, max_words=MAX_WORDS):
    """Cut *text* after its first *max_words* words.

    Words are runs of ``\\w`` characters, the same definition used for the
    count shown to the user.

    Returns:
        A ``(text, word_count)`` tuple, where ``text`` is the input cut at
        the end of the last allowed word (unchanged when within the limit)
        and ``word_count`` is the number of words in the *original* input.
    """
    words = list(re.finditer(r"\w+", text))
    if len(words) <= max_words:
        return text, len(words)
    cut = words[max_words - 1].end() if max_words > 0 else 0
    return text[:cut], len(words)


def _form_columns(description):
    """Lay out one form row and fill its left column.

    Writes *description* and the logo image into the narrow left column and
    returns the ``(main, side)`` columns for the caller to populate.
    """
    _, info_col, _, main_col, side_col = hy.columns(COLUMN_WIDTHS)
    with info_col:
        hy.write(description)
        hy.image(LOGO_IMAGE)
    return main_col, side_col


def _annotate_pdf_entities(document):
    """Run NER on every page of *document* and mark the hits in place.

    For each page the recognised entities are de-duplicated by value,
    filtered by ``SCORE_THRESHOLD`` and ``MIN_ENTITY_LENGTH``, and every
    occurrence of the entity text on that page receives a rectangle
    annotation whose title is the entity group.

    Returns:
        A DataFrame with columns ``Page`` (1-based), ``Entity Group``,
        ``Value`` and ``Score`` — one row per retained entity.
    """
    rows = []
    for page_index in range(len(document)):
        page_text = document.get_page_text(page_index)
        if not page_text.strip():
            # Nothing to recognise on a page without extractable text.
            continue

        predictions = ner_prediction(corpus=page_text, compute=COMPUTE_DEVICE)
        if predictions is None or predictions.empty:
            # Avoid a KeyError from drop_duplicates on a frame without the
            # expected columns.
            continue

        unique = predictions.drop_duplicates(subset=["value"], keep='first')
        current_page = document[page_index]

        for _, row in unique.iterrows():
            text = row['value']
            if not (row["score"] > SCORE_THRESHOLD
                    and len(text) >= MIN_ENTITY_LENGTH):
                continue

            rows.append(
                [page_index + 1, row['entity_group'], text, row['score']]
            )

            # Mark every occurrence of the entity text on this page.
            for inst in current_page.search_for(text) or []:
                annot = current_page.add_rect_annot(inst)
                info = annot.info
                info["title"] = row['entity_group']
                annot.set_info(info)
                annot.update()

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(
        rows, columns=["Page", "Entity Group", "Value", "Score"]
    )


def _show_pdf(pdf_bytes):
    """Embed *pdf_bytes* in the page as a base64 data-URI iframe."""
    base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        f'width="100%" height="1000" type="application/pdf"></iframe>'
    )
    st.markdown(pdf_display, unsafe_allow_html=True)


# individual pages
@app.addapp(is_home=True)
def my_home():
    """Render the home page: title, project summary and banner image."""
    # NOTE(review): unsafe_allow_html is set but the string carries no
    # markup — confirm whether heading tags were lost from this literal.
    hy.markdown(
        "\n\nBiomedical Epidemiology Named Entity Recognition System\n\n",
        unsafe_allow_html=True,
    )

    st.write(
        "There are a few challenges related to the task of biomedical named "
        "entity recognition, which are: the existing methods consider a "
        "fewer number of biomedical entities (e.g., disease, symptom, "
        "proteins, genes); and these methods do not consider the social "
        "determinants of health (age, gender, employment, race), which are "
        "the non- medical factors related to patients’ health. We propose a "
        "machine learning pipeline that improves on previous efforts in the "
        "following ways: first, it recognizes many biomedical entity types "
        "other than the standard ones; second, it considers non-clinical "
        "factors related to patient’s health. This pipeline also consists "
        "of stages, such as pre- processing, tokenization, mapping "
        "embedding lookup and named entity recognition task to extract "
        "biomedical named entities from the free texts. We present a new "
        "dataset that we prepare by curating the COVID-19 case reports. The "
        "proposed approach outperforms the baseline methods on five "
        "benchmark datasets with macro-and micro-average F1 scores around "
        "90, as well as our dataset with a macro-and micro-average F1 score "
        "of 95.25 and 93.18 respectively"
    )

    hy.image("Epidemiologist.jpeg")


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the recognizer page with a free-text form and a PDF form.

    The text form runs NER on the pasted text (limited to ``MAX_WORDS``
    words) and shows the result table with a CSV download link. The PDF
    form runs NER page by page, shows the result table with a CSV download
    link, and displays the annotated PDF.
    """
    hy.subheader("NER from text corpus")

    with hy.form(key="text_form"):
        main_col, side_col = _form_columns(
            "You can paste your biomedical data here. The Named Entity "
            "Recognition model will identify the required entities"
        )

        with main_col:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )

            # Truncate on word boundaries; the original sliced the first
            # MAX_WORDS *characters*, contradicting the warning below.
            doc, word_count = _truncate_to_word_limit(doc, MAX_WORDS)
            if word_count > MAX_WORDS:
                st.warning(
                    f"⚠️ Your text contains {word_count} words."
                    f" Only the first {MAX_WORDS} words will be reviewed."
                    " Stay tuned as increased allowance is coming! 😊"
                )

            st.form_submit_button(label=SUBMIT_LABEL)

        if doc:
            pred_df = ner_prediction(corpus=doc, compute=COMPUTE_DEVICE)
            with side_col:
                st.dataframe(pred_df)
                download_button(
                    pred_df, "key-value-content.csv", DOWNLOAD_LABEL
                )

    # Vertical spacing between the two forms.
    hy.markdown(" ")
    hy.markdown(" ")
    hy.markdown(" ")

    hy.subheader("NER from Pdf Reports")

    with hy.form(key="pdf_form"):
        main_col, _ = _form_columns(
            "You can upload your biomedical report here. The Named Entity "
            "Recognition model will identify the required entities"
        )

        with main_col:
            uploaded_file = st.file_uploader(
                'Choose your .pdf file', type=["pdf"]
            )
            st.form_submit_button(label=SUBMIT_LABEL)

        if uploaded_file is not None:
            try:
                # getvalue() is independent of the stream position; the
                # context manager closes the document when done.
                with fitz.open(stream=uploaded_file.getvalue(),
                               filetype="pdf") as document:
                    final_df = _annotate_pdf_entities(document)
                    # Serialise in memory rather than writing a file named
                    # after the upload into the working directory.
                    annotated_pdf = document.tobytes()
            except Exception as e:
                print("Error occured: {}".format(e))
                raise

            with main_col:
                if not final_df.empty:
                    final_df['Pdf File'] = uploaded_file.name
                    final_df = final_df[
                        ['Entity Group', 'Value', 'Score', 'Page', 'Pdf File']
                    ]
                    st.dataframe(final_df)
                    download_button(
                        final_df, "key-value-pdf.csv", DOWNLOAD_LABEL
                    )
                else:
                    # Surface the message to the user, not only to stdout.
                    st.info("No Entities Extracted!!!")

                _show_pdf(annotated_pdf)


app.run()