Spaces:
Running
on
Zero
Running
on
Zero
srijaydeshpande
committed on
Update
Browse files
app.py
CHANGED
@@ -2,16 +2,12 @@ from pdfminer.high_level import extract_pages
|
|
2 |
from pdfminer.layout import LTTextContainer
|
3 |
from tqdm import tqdm
|
4 |
import re
|
5 |
-
|
|
|
6 |
import gradio as gr
|
7 |
import os
|
8 |
from llama_cpp import Llama
|
9 |
-
|
10 |
-
import transformers
|
11 |
-
# from transformers import GemmaTokenizer, AutoModelForCausalLM
|
12 |
-
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
13 |
-
import accelerate
|
14 |
-
import torch
|
15 |
|
16 |
def process_document(pdf_path, page_ids=None):
|
17 |
|
@@ -112,33 +108,32 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top
|
|
112 |
top_p=top_probability,
|
113 |
)
|
114 |
output = outputs[0]["generated_text"][len(prompt):]
|
115 |
-
|
116 |
return output
|
117 |
|
|
|
|
|
|
|
|
|
118 |
def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
for file in output_files:
|
138 |
-
zipf.write(file, os.path.basename(file))
|
139 |
-
return 'anonymized_reports'
|
140 |
-
|
141 |
-
# model_id = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
|
142 |
# model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
|
143 |
|
144 |
# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
|
|
|
2 |
from pdfminer.layout import LTTextContainer
|
3 |
from tqdm import tqdm
|
4 |
import re
|
5 |
+
import io
|
6 |
+
import zipfile
|
7 |
import gradio as gr
|
8 |
import os
|
9 |
from llama_cpp import Llama
|
10 |
+
import tempfile
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def process_document(pdf_path, page_ids=None):
|
13 |
|
|
|
108 |
top_p=top_probability,
|
109 |
)
|
110 |
output = outputs[0]["generated_text"][len(prompt):]
|
111 |
+
|
112 |
return output
|
113 |
|
114 |
+
def mkdir(dir):
    """Create directory ``dir``, including missing parents, if it is absent.

    Args:
        dir: Path of the directory to create. The parameter name shadows the
            builtin ``dir`` but is kept so existing keyword callers still work.

    Raises:
        FileExistsError: If ``dir`` already exists and is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the same directory in between.
    os.makedirs(dir, exist_ok=True)
|
117 |
+
|
118 |
def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
    """De-identify each PDF in ``files`` and bundle the results into a zip file.

    Every input whose extension is ``.pdf`` (case-insensitive) is passed to
    ``process_document`` and then ``deidentify_doc``; the returned text is
    stored in the archive as ``<original stem>.txt``. Inputs with any other
    extension are ignored.

    Args:
        files: Iterable of file paths, or ``None`` / empty for no input.
        prompt: Prompt forwarded to ``deidentify_doc``.
        maxtokens: Token limit forwarded to ``deidentify_doc``.
        temperature: Sampling temperature forwarded to ``deidentify_doc``.
        top_probability: Top-p value forwarded to ``deidentify_doc``.

    Returns:
        Path of a temporary ``.zip`` file holding the anonymized texts. The
        file is created with ``delete=False``, so the caller owns its cleanup.
    """
    # The archive is assembled in memory first so that a failure while
    # processing a document does not leave a half-written temp file behind.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
        # `files or []` tolerates a None value when nothing was supplied.
        for file in files or []:
            # splitext keeps dotted stems intact ("report.v2.pdf" -> "report.v2")
            # and only ever inspects the final extension.
            stem, ext = os.path.splitext(os.path.basename(file))
            if ext.lower() != '.pdf':
                continue
            page2content = process_document(file, page_ids=[0])
            # NOTE(review): key 1 is presumably the first page's text as
            # keyed by process_document -- confirm against that function.
            pdftext = page2content[1]
            if pdftext:
                anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
                zf.writestr(stem + '.txt', anonymized_text)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as temp_file:
        # getvalue() returns the whole buffer regardless of stream position.
        temp_file.write(zip_buffer.getvalue())
    return temp_file.name
|
135 |
+
|
136 |
+
# model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
|
|
|
|
|
|
|
|
|
|
|
137 |
# model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
|
138 |
|
139 |
# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
|