# Summarize / summarize.py
# IsakNordgren's picture
# More examples and an improved prompt. Testing step-by-step prompting.
# a7a9d9e
import streamlit as st
import pdfplumber
from PIL import Image
import pytesseract
#from transformers import pipeline
import io
import os
from dotenv import load_dotenv
from groqSummarizer import GroqSummarizer
# SwedishBeagle-dare
from transformers import AutoTokenizer
import transformers
import torch
from streamlit import session_state as ss
from streamlit_pdf_viewer import pdf_viewer
class Summarizer:
    """Streamlit UI that extracts text from an uploaded image or PDF and summarizes it.

    PDF text is read with ``pdfplumber``, image text with ``pytesseract``; the
    summary comes from the backend named by ``model``.
    """

    def __init__(self, model="groq"):
        """Remember which summarization backend to use.

        Args:
            model: ``"groq"`` or ``"SwedishBeagle-dare"``.
        """
        self.model = model

    def run_app(self):
        """Render the upload widget, then the summary, extracted text and PDF."""
        # Session slot holding the uploaded PDF for the viewer.
        if 'pdf_ref' not in ss:
            ss.pdf_ref = None

        uploaded_file = st.file_uploader(
            "Upload an Image or PDF",
            type=["jpg", "jpeg", "png", "pdf"],
            key="file",
        )
        if uploaded_file is None:
            return

        if uploaded_file.type == "application/pdf":
            with st.spinner("Extracting text from PDF..."):
                text = self.extract_text_from_pdf(uploaded_file)
            ss.pdf_ref = uploaded_file
        else:
            # Forget any PDF kept from an earlier upload so it is not shown
            # as the "Original pdf" of this image.
            ss.pdf_ref = None
            image = Image.open(uploaded_file)
            with st.spinner("Extracting text from image..."):
                text = self.extract_text_from_image(image)

        if text:
            with st.spinner("Summarizing text..."):
                # Dispatch on self.model instead of hard-coding one backend.
                summary = self.summarize(text)
            st.subheader("Summary")
            st.write(summary)
            with st.expander("Extracted Text", expanded=False):
                st.write(text)

        if ss.pdf_ref:
            st.subheader("Original pdf")
            binary_data = ss.pdf_ref.getvalue()
            pdf_viewer(input=binary_data, width=700)

    def extract_text_from_image(self, image):
        """Return the text that ``pytesseract`` reads from ``image``."""
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf):
        """Return the text of every page of ``pdf``, one page per line break.

        Args:
            pdf: Anything ``pdfplumber.open`` accepts (here the uploaded file).

        Returns:
            The page texts joined with newlines; pages without text are skipped.
        """
        with pdfplumber.open(pdf) as pdf_file:
            # ``or ""`` guards against a page yielding None instead of a string.
            page_texts = [page.extract_text() or "" for page in pdf_file.pages]
        # A separator keeps the last word of one page from fusing with the
        # first word of the next.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def shorten_text(self, text, max_tokens):
        """Truncate ``text`` to at most ``max_tokens`` space-separated words.

        Args:
            text: The text to shorten.
            max_tokens: Maximum number of words to keep; negative values are
                treated as 0.

        Returns:
            ``text`` unchanged when it is already short enough, otherwise its
            first ``max_tokens`` words joined by single spaces.
        """
        tokens = text.split(" ")
        if len(tokens) <= max_tokens:
            return text
        # A negative limit would otherwise slice from the end of the list.
        limit = max(max_tokens, 0)
        print("Shortened text to " + str(limit) + " tokens")
        return " ".join(tokens[:limit])

    def summarize_using_groq(self, text):
        """Summarize ``text`` with ``GroqSummarizer``, shortening it on HTTP 429.

        Returns:
            The summary, or a string starting with ``"Error: "`` when the
            request fails for another reason or keeps being rejected.
        """
        # A 429 response is treated as "too many tokens in the request", so
        # the text is shortened and the request is retried.
        #
        # https://context.ai/compare/llama3-70b-instruct-v1/gpt-4
        # ^^ says the maximum is 8000 tokens, but testing suggests it is closer
        # to 2000 when tokens are counted as space-separated words (which is
        # not an entirely correct way of counting them).
        max_tokens = 2000
        # Stop once the limit has decayed to zero instead of retrying forever.
        while max_tokens > 0:
            try:
                return GroqSummarizer().summarize(text)
            except Exception as e:
                # Not every exception carries an HTTP response; look it up
                # defensively so unrelated failures are reported, not re-raised
                # as AttributeError from inside this handler.
                response = getattr(e, "response", None)
                if getattr(response, "status_code", None) != 429:
                    return "Error: " + str(e)
            text = self.shorten_text(text, max_tokens)
            max_tokens = int(max_tokens * 0.9)
        return "Error: the request was still rejected (429) after shortening the text"

    def summarize_using_swedishbeagle(self, text):
        """Summarize ``text`` with the ``FredrikBL/SwedishBeagle-dare`` model.

        Returns:
            The text generated by the model, without the prompt.
        """
        # https://huggingface.co/FredrikBL/SwedishBeagle-dare
        model = "FredrikBL/SwedishBeagle-dare"
        messages = [
            {
                "role": "system",
                "content": "You summarize texts that the users sends"
            },
            {
                "role": "user",
                "content": text
            }
        ]
        tokenizer = AutoTokenizer.from_pretrained(model)
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        generator = transformers.pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        outputs = generator(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            # Without this the result echoes the whole prompt (including the
            # input text) in front of the summary.
            return_full_text=False,
        )
        return outputs[0]["generated_text"]

    def summarize(self, text):
        """Summarize ``text`` with the backend selected in the constructor.

        Raises:
            ValueError: If ``self.model`` is not a known backend name.
        """
        if self.model == "groq":
            return self.summarize_using_groq(text)
        if self.model == "SwedishBeagle-dare":
            return self.summarize_using_swedishbeagle(text)
        raise ValueError(f"Unknown summarization model: {self.model!r}")
# Streamlit app
def main():
    """Build the summarizer with the chosen backend and run the Streamlit app."""
    # Available values for ``model``:
    #   - groq
    #   - SwedishBeagle-dare
    Summarizer(model="groq").run_app()


# Only start the app when this file is executed as a script.
if __name__ == "__main__":
    main()