# -*- coding: utf-8 -*-
"""
Created on Mon Jul 4 08:43:02 2022
@author: dreji18
"""
import streamlit as st
import hydralit_components as hc
import datetime
import time
from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
from functionforDownloadButtons import download_button
import fitz
import pandas as pd
import base64
# set page size wide and theme
st.set_page_config(layout='wide', initial_sidebar_state='collapsed')

# Colour overrides passed to the Hydralit navbar via ``navbar_theme`` below.
over_theme = {
    'txc_inactive': '#FFFFFF',
    'menu_background': '#696969',
    'txc_active': 'black',
}

# app page setup
import hydralit as hy

# The application object; the page functions further down register themselves
# on it through the ``@app.addapp`` decorator.
app = hy.HydraApp(
    title='Biomedical Epidemiology NER App',
    nav_container=None,
    # Was ``bool`` (the type object itself), which only worked because a type
    # is truthy; pass the intended boolean value explicitly.
    nav_horizontal=True,
    layout='wide',
    # favicon="🧊",
    use_navbar=True,
    navbar_theme=over_theme,
    navbar_sticky=True,
    navbar_mode='pinned',
    use_loader=True,
    use_cookie_cache=True,
    sidebar_state='auto',
    navbar_animation=True,
    allow_url_nav=False,
    hide_streamlit_markers=True,
    # Disabled banner options, kept on single comment lines so they cannot
    # leak into the call as code.
    # NOTE(review): the 'header' value presumably carried HTML markup originally - confirm before re-enabling.
    # use_banner_images=["./background.png", None, {'header': "Biomedical Epidemiology Entity Recognizer"}, None, "./background.png"],
    # banner_spacing=[5, 30, 60, 30, 5],
    clear_cross_app_sessions=True,
    session_params=None,
)
# individual pages
@app.addapp(is_home=True)
def my_home():
    """Render the home page: a heading, the project abstract and an image."""
    # The heading literal used to be split across two physical lines, which is
    # an unterminated string; the line break is now an explicit ``\n``.
    # NOTE(review): unsafe_allow_html=True suggests the heading originally
    # carried HTML markup that is no longer in the literal - confirm/restore.
    hy.markdown(
        "Biomedical Epidemiology Named Entity Recognition System\n",
        unsafe_allow_html=True,
    )

    # Project abstract, shown verbatim.
    st.write("""There are a few challenges related to the task of biomedical named
entity recognition, which are: the existing methods consider a fewer
number of biomedical entities (e.g., disease, symptom, proteins,
genes); and these methods do not consider the social determinants
of health (age, gender, employment, race), which are the non-
medical factors related to patients’ health. We propose a machine
learning pipeline that improves on previous efforts in the following
ways: first, it recognizes many biomedical entity types other than
the standard ones; second, it considers non-clinical factors related
to patient’s health. This pipeline also consists of stages, such as pre-
processing, tokenization, mapping embedding lookup and named
entity recognition task to extract biomedical named entities from
the free texts. We present a new dataset that we prepare by curating
the COVID-19 case reports. The proposed approach outperforms
the baseline methods on five benchmark datasets with macro-and
micro-average F1 scores around 90, as well as our dataset with a
macro-and micro-average F1 score of 95.25 and 93.18 respectively""")

    hy.image("Epidemiologist.jpeg")
def _truncate_to_words(text, max_words):
    r"""Count the words in *text* and cut it after the first *max_words* words.

    A word is a match of the regex ``\w+`` (the same definition the page uses
    for its word count).

    Returns:
        ``(word_count, text)`` where ``word_count`` is the number of words
        before truncation and ``text`` is returned unchanged when it is
        already within the limit.
    """
    import re

    words = list(re.finditer(r"\w+", text))
    if len(words) > max_words:
        # Cut right after the last permitted word so no partial word is kept.
        cutoff = words[max_words - 1].end() if max_words > 0 else 0
        text = text[:cutoff]
    return len(words), text


def _annotate_entities(document):
    """Run NER on each page of *document* and box every accepted entity.

    For each page the text is sent to ``ner_prediction``; duplicate values are
    dropped, and every entity with a score above 0.5 and more than two
    characters gets a rectangle annotation (titled with its entity group) on
    each place the page text search finds it.

    Returns:
        A list of ``[entity_group, value, score, page_number]`` rows with
        1-based page numbers, one per accepted entity.
    """
    rows = []
    # ``page_count`` is the snake_case name matching the other PyMuPDF calls
    # used here; the camelCase ``pageCount`` alias is deprecated.
    for page_number in range(document.page_count):
        page_text = document.get_page_text(page_number)
        out = ner_prediction(corpus=page_text, compute='gpu')
        # NOTE(review): presumably a page without entities can yield a frame
        # lacking the "value" column; skipping empty results avoids a KeyError
        # in drop_duplicates and changes nothing otherwise.
        if out.empty:
            continue

        current_page = document[page_number]
        unique_entities = out.drop_duplicates(subset=["value"], keep='first')
        for _, row in unique_entities.iterrows():
            text = row['value']
            # selecting values which have a score greater than 0.5 and
            # skipping words shorter than 3 characters to avoid false positives
            if row["score"] > 0.5 and len(text) > 2:
                rows.append([row['entity_group'], text, row['score'], page_number + 1])
                # mark every occurrence of the entity text on the page
                for inst in current_page.search_for(text):
                    annot = current_page.add_rect_annot(inst)
                    info = annot.info
                    info["title"] = row['entity_group']
                    annot.set_info(info)
                    annot.update()
    return rows


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the 'Entity Recognizer' page.

    The page has two forms. The first takes pasted text (limited to
    ``MAX_WORDS`` words), runs ``ner_prediction`` on it and shows the result
    with a CSV download. The second takes an uploaded PDF, annotates the
    recognised entities in it, shows the entity table with a CSV download,
    saves the annotated copy next to the app as ``<name>_annot.pdf`` and
    embeds that copy in the page.

    Raises:
        Exception: anything raised while processing the PDF is printed and
        re-raised unchanged.
    """
    MAX_WORDS = 500

    # ---- free-text form -------------------------------------------------
    hy.subheader("NER from text corpus")
    with hy.form(key="text_form"):
        # the two narrow columns are spacers only
        _, c1, _, c2, c3 = hy.columns([0.07, 1, 0.07, 4, 1.5])
        with c1:
            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")
        with c2:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )
            # Truncate by words, as the warning states; the previous slice
            # ``doc[:MAX_WORDS]`` kept only the first 500 characters.
            res, doc = _truncate_to_words(doc, MAX_WORDS)
            if res > MAX_WORDS:
                st.warning(
                    "⚠️ Your text contains "
                    + str(res)
                    + " words."
                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
                )
            st.form_submit_button(label="🍃 Get me the data!")
        if len(doc) != 0:
            pred_df = ner_prediction(corpus=doc, compute='gpu')  # pass compute='gpu' if using gpu
            with c3:
                st.dataframe(pred_df)
                download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

    hy.markdown(" ")
    hy.markdown(" ")
    hy.markdown(" ")

    # ---- PDF form -------------------------------------------------------
    hy.subheader("NER from Pdf Reports")
    with hy.form(key="pdf_form"):
        _, c1, _, c2, c3 = hy.columns([0.07, 1, 0.07, 4, 1.5])
        with c1:
            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")
        with c2:
            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
            st.form_submit_button(label="🍃 Get me the data!")
        if uploaded_file is not None:
            annotated_name = uploaded_file.name.replace(".pdf", "_annot.pdf")
            try:
                # getvalue() returns the whole upload regardless of the
                # buffer's current read position; the context manager closes
                # the document once the annotated copy has been written.
                with fitz.open(stream=uploaded_file.getvalue(), filetype="pdf") as document:
                    rows = _annotate_entities(document)
                    with c2:
                        if rows:
                            # Build the table in one go instead of growing a
                            # DataFrame one ``.loc`` row at a time.
                            final_df = pd.DataFrame(
                                rows,
                                columns=['Entity Group', 'Value', 'Score', 'Page'],
                            )
                            final_df['Pdf File'] = uploaded_file.name
                            st.dataframe(final_df)
                            download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
                        else:
                            # Shown in the page; this used to be printed to
                            # the server console where the user never saw it.
                            st.info("No Entities Extracted!!!")
                    document.save(annotated_name)

                with c2:
                    with open(annotated_name, "rb") as f:
                        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
                    # Embed the annotated PDF as a data-URI iframe. The
                    # literal was empty before, so nothing was rendered and
                    # base64_pdf went unused.
                    # NOTE(review): width/height are a sensible default, adjust to taste.
                    pdf_display = (
                        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
                        'width="100%" height="800" type="application/pdf"></iframe>'
                    )
                    st.markdown(pdf_display, unsafe_allow_html=True)
            except Exception as e:
                print("Error occured: {}".format(e))
                # bare raise keeps the original traceback intact
                raise
# Start the Hydralit application object created earlier in this file.
app.run()