IsakNordgren committed on
Commit
79b4e95
·
1 Parent(s): 92a715c

working locally

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
__pycache__/summarize.cpython-310.pyc ADDED
Binary file (3.61 kB). View file
 
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from summarize import Summarizer
4
+
5
def main():
    """Render the page title and hand control to the Summarizer UI.

    Builds a ``Summarizer`` with its default model and lets it draw the
    upload widget, the summary and the extracted text.
    """
    st.title("Text Extractor and Summarizer")

    summarizer = Summarizer()
    summarizer.run_app()


# Guarded so that importing this module (e.g. from a test or another
# script) does not start the UI; running the file as a script, which is
# how ``streamlit run`` executes it, still calls main().
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pdfplumber
3
+ pillow
4
+ pytesseract
5
+ transformers
6
+ torch
7
+ groq
8
+ python-dotenv
summarize.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdfplumber
3
+ from PIL import Image
4
+ import pytesseract
5
+ #from transformers import pipeline
6
+ import io
7
+
8
+ import os
9
+ from dotenv import load_dotenv
10
+
11
+ # groq
12
+ from groq import Groq
13
+
14
+ # SwedishBeagle-dare
15
+ from transformers import AutoTokenizer
16
+ import transformers
17
+ import torch
18
+
19
class Summarizer:
    """Extract text from an uploaded image or PDF and summarize it.

    PDFs are read with pdfplumber and images with pytesseract. The summary
    is produced by the backend named in ``model``: the Groq chat API or the
    locally run ``FredrikBL/SwedishBeagle-dare`` model.
    """

    # Model identifiers accepted by ``summarize``.
    GROQ_MODEL = "groq"
    SWEDISHBEAGLE_MODEL = "SwedishBeagle-dare"

    # System prompt shared by both backends.
    SYSTEM_PROMPT = "You summarize texts that the users sends"

    def __init__(self, model="groq"):
        """Store the backend name and create the Groq client if it is needed.

        Args:
            model: ``"groq"`` or ``"SwedishBeagle-dare"``.
        """
        self.model = model
        # Only the Groq backend needs an API client. Creating it
        # unconditionally would make the local backend depend on
        # GROQ_API_KEY being configured.
        self.client = self.load_groq() if model == self.GROQ_MODEL else None

    def run_app(self):
        """Draw the upload widget and show the summary and extracted text."""
        uploaded_file = st.file_uploader(
            "Upload an Image or PDF", type=["jpg", "jpeg", "png", "pdf"]
        )
        if uploaded_file is None:
            return

        if uploaded_file.type == "application/pdf":
            with st.spinner("Extracting text from PDF..."):
                text = self.extract_text_from_pdf(uploaded_file)
        else:
            image = Image.open(uploaded_file)
            with st.spinner("Extracting text from image..."):
                text = self.extract_text_from_image(image)

        # Whitespace-only output carries nothing to summarize, so tell the
        # user instead of sending it to the model.
        if not text or not text.strip():
            st.warning("No text could be extracted from the uploaded file.")
            return

        with st.spinner("Summarizing text..."):
            # Dispatch on self.model rather than hard-coding one backend.
            summary = self.summarize(text)
        st.subheader("Summary")
        st.write(summary)

        st.subheader("Extracted Text")
        st.write(text)

    def extract_text_from_image(self, image):
        """Return the text pytesseract recognizes in ``image``."""
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf):
        """Return the text of every page of ``pdf``, one page per line block.

        Args:
            pdf: Anything ``pdfplumber.open`` accepts (path or file object).
        """
        with pdfplumber.open(pdf) as pdf_file:
            # The generator is consumed inside the ``with`` block, while the
            # PDF is still open.
            return self._join_page_texts(
                page.extract_text() for page in pdf_file.pages
            )

    @staticmethod
    def _join_page_texts(page_texts):
        """Join per-page texts with newlines, skipping empty or ``None`` pages."""
        return "\n".join(text for text in page_texts if text)

    @classmethod
    def _build_messages(cls, text):
        """Return the system + user chat messages used by both backends."""
        return [
            {"role": "system", "content": cls.SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ]

    def load_groq(self):
        """Load ``.env`` and return a Groq client built from ``GROQ_API_KEY``."""
        load_dotenv()
        return Groq(api_key=os.getenv("GROQ_API_KEY"))

    def summarize_using_groq(self, text):
        """Summarize ``text`` with Groq's ``mixtral-8x7b-32768`` chat model."""
        if self.client is None:
            # The client is created lazily when the instance was built for
            # another backend but this method is called directly.
            self.client = self.load_groq()

        chat_completion = self.client.chat.completions.create(
            messages=self._build_messages(text),
            model="mixtral-8x7b-32768",
        )
        return chat_completion.choices[0].message.content

    def summarize_using_swedishbeagle(self, text):
        """Summarize ``text`` with the local SwedishBeagle-dare model.

        Model card: https://huggingface.co/FredrikBL/SwedishBeagle-dare
        """
        model = "FredrikBL/SwedishBeagle-dare"

        tokenizer = AutoTokenizer.from_pretrained(model)
        prompt = tokenizer.apply_chat_template(
            self._build_messages(text),
            tokenize=False,
            add_generation_prompt=True,
        )
        generator = transformers.pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        outputs = generator(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            # Return only the newly generated text; by default the prompt
            # (which contains the whole input) is echoed back as well.
            return_full_text=False,
        )
        return outputs[0]["generated_text"]

    def summarize(self, text):
        """Summarize ``text`` with the backend selected by ``self.model``.

        Raises:
            ValueError: If ``self.model`` is not a known backend name.
        """
        if self.model == self.GROQ_MODEL:
            return self.summarize_using_groq(text)
        if self.model == self.SWEDISHBEAGLE_MODEL:
            return self.summarize_using_swedishbeagle(text)
        raise ValueError(f"Unknown summarization model: {self.model!r}")
126
+
127
+
128
+ # Streamlit app
129
def main():
    """Start the Streamlit UI backed by the SwedishBeagle-dare model."""
    # Available model identifiers:
    #   - groq
    #   - SwedishBeagle-dare
    app = Summarizer(model="SwedishBeagle-dare")
    app.run_app()


if __name__ == "__main__":
    main()