import streamlit as st
import pdfplumber
from PIL import Image
import pytesseract
#from transformers import pipeline
import io
import os
from dotenv import load_dotenv
from groqSummarizer import GroqSummarizer
# SwedishBeagle-dare
from transformers import AutoTokenizer
import transformers
import torch
from streamlit import session_state as ss
from streamlit_pdf_viewer import pdf_viewer
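

# A Streamlit app: upload a PDF or image, extract its text (pdfplumber for
# PDFs, pytesseract OCR for images), and summarize it with the selected model.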
class Summarizer:
    def __init__(self, model="groq"):
        self.model = model
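
    # Renders the Streamlit UI: file upload, text extraction, summarization,
    # and display of the summary, the extracted text, and the original PDF.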
    def run_app(self):
        # For displaying the PDF
        if 'pdf_ref' not in ss:
            ss.pdf_ref = None

        uploaded_file = st.file_uploader("Upload an Image or PDF", type=["jpg", "jpeg", "png", "pdf"], key="file")

        if uploaded_file is not None:
            if uploaded_file.type == "application/pdf":
                with st.spinner("Extracting text from PDF..."):
                    text = self.extract_text_from_pdf(uploaded_file)
                if ss.file:
                    ss.pdf_ref = ss.file
            else:
                image = Image.open(uploaded_file)
                with st.spinner("Extracting text from image..."):
                    text = self.extract_text_from_image(image)

            if text:
                with st.spinner("Summarizing text..."):
                    # Dispatch through summarize() so the model chosen in
                    # __init__ is respected, rather than always calling Groq.
                    summary = self.summarize(text)
                st.subheader("Summary")
                st.write(summary)

                with st.expander("Extracted Text", expanded=False):
                    st.write(text)

        if ss.pdf_ref:
            st.subheader("Original PDF")
            binary_data = ss.pdf_ref.getvalue()
            pdf_viewer(input=binary_data, width=700)

    # Function to extract text from an image
    def extract_text_from_image(self, image):
        text = pytesseract.image_to_string(image)
        return text

    # Function to extract text from a PDF
    def extract_text_from_pdf(self, pdf):
        text = ""
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # extract_text() returns None for pages without text
                text += page.extract_text() or ""
        return text
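
    # Truncates the text to roughly max_tokens tokens, where a "token" is
    # approximated as a space-separated word (not an exact count, but good
    # enough for the 429 retry loop below).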
    def shorten_text(self, text, max_tokens):
        tokens = text.split(" ")
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            text = " ".join(tokens)
            print(f"Shortened text to {max_tokens} tokens")
        return text
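
    # Calls the Groq API, retrying with a progressively shorter input
    # (10% fewer tokens each attempt) for as long as it returns HTTP 429.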
    def summarize_using_groq(self, text):
        # Decrease the number of tokens if the response is 429, i.e. too many tokens in the request
        #
        # https://context.ai/compare/llama3-70b-instruct-v1/gpt-4
        # ^^ Says that max tokens is 8000, but after testing it seems to be
        # closer to 2000 if you count tokens as separated by spaces.
        # (This is not an entirely correct way to count it.)
        # max_tokens = 8000
        max_tokens = 2000
        while True:
            try:
                gs = GroqSummarizer()
                return gs.summarize(text)
            except Exception as e:
                # Not every exception carries an HTTP response, so probe for
                # the status code defensively instead of assuming e.response.
                status = getattr(getattr(e, "response", None), "status_code", None)
                if status == 429:
                    text = self.shorten_text(text, max_tokens)
                    max_tokens = int(max_tokens * 0.9)
                else:
                    return "Error: " + str(e)
    def summarize_using_swedishbeagle(self, text):
        # https://huggingface.co/FredrikBL/SwedishBeagle-dare
        model = "FredrikBL/SwedishBeagle-dare"
        messages = [
            {
                "role": "system",
                "content": "You summarize texts that the user sends"
            },
            {
                "role": "user",
                "content": text
            }
        ]
        tokenizer = AutoTokenizer.from_pretrained(model)
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        outputs = pipeline(
            prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95
        )
        return outputs[0]["generated_text"]
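
    # Routes to the summarizer implementation selected in __init__.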
    def summarize(self, text):
        if self.model == "groq":
            return self.summarize_using_groq(text)
        elif self.model == "SwedishBeagle-dare":
            return self.summarize_using_swedishbeagle(text)


# Streamlit app
def main():
    # Models:
    # - groq
    # - SwedishBeagle-dare
    summarizer = Summarizer(model="groq")
    summarizer.run_app()


if __name__ == "__main__":
    main()
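
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py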