srijaydeshpande committed on
Commit
b0abf08
·
verified ·
1 Parent(s): 12498a0
Files changed (1) hide show
  1. app.py +26 -31
app.py CHANGED
@@ -2,16 +2,12 @@ from pdfminer.high_level import extract_pages
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
5
- from zipfile import ZipFile
 
6
  import gradio as gr
7
  import os
8
  from llama_cpp import Llama
9
- from gpt4all import GPT4All
10
- import transformers
11
- # from transformers import GemmaTokenizer, AutoModelForCausalLM
12
- # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
13
- import accelerate
14
- import torch
15
 
16
  def process_document(pdf_path, page_ids=None):
17
 
@@ -112,33 +108,32 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top
112
  top_p=top_probability,
113
  )
114
  output = outputs[0]["generated_text"][len(prompt):]
115
-
116
  return output
117
 
 
 
 
 
118
  def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
119
- output_files = []
120
- for file in files:
121
- file_name = os.path.basename(file)
122
- file_name_splt = file_name.split('.')
123
- print('File name is ', file_name)
124
- if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
125
- page2content = process_document(file, page_ids=[0])
126
- pdftext = page2content[1]
127
- if(pdftext):
128
- anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
129
- # html = txt_to_html(display_text)
130
- # with open('out.html', "w", encoding="utf-8") as file:
131
- # file.write(html)
132
- with open(file_name_splt[0]+'.txt', 'w') as outputfile:
133
- # Write some text to the file
134
- outputfile.write(anonymized_text)
135
- output_files.append(file_name_splt[0]+'.txt')
136
- zipf = ZipFile('anonymized_reports', 'w')
137
- for file in output_files:
138
- zipf.write(file, os.path.basename(file))
139
- return 'anonymized_reports'
140
-
141
- # model_id = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
142
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
143
 
144
  # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
 
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
5
+ import io
6
+ import zipfile
7
  import gradio as gr
8
  import os
9
  from llama_cpp import Llama
10
+ import tempfile
 
 
 
 
 
11
 
12
  def process_document(pdf_path, page_ids=None):
13
 
 
108
  top_p=top_probability,
109
  )
110
  output = outputs[0]["generated_text"][len(prompt):]
111
+
112
  return output
113
 
114
+ def mkdir(dir):
115
+ if not os.path.exists(dir):
116
+ os.makedirs(dir)
117
+
118
  def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
119
+ zip_buffer = io.BytesIO()
120
+ with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
121
+ for file in files:
122
+ file_name = os.path.basename(file)
123
+ file_name_splt = file_name.split('.')
124
+ if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
125
+ page2content = process_document(file, page_ids=[0])
126
+ pdftext = page2content[1]
127
+ if(pdftext):
128
+ anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
129
+ zf.writestr(file_name_splt[0]+'.txt', anonymized_text)
130
+ zip_buffer.seek(0)
131
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as temp_file:
132
+ temp_file.write(zip_buffer.getvalue())
133
+ temp_file_path = temp_file.name
134
+ return temp_file_path
135
+
136
+ # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
 
 
 
 
 
137
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
138
 
139
  # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')