vilarin committed on
Commit
e740e32
·
verified ·
1 Parent(s): 6ec9fb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
6
  import os
7
  from threading import Thread
8
 
9
- from langchain_community.document_loaders import PyMuPDFLoader
10
  import docx
11
  from pptx import Presentation
12
 
@@ -56,11 +56,11 @@ def extract_text(path):
56
  return open(path, 'r').read()
57
 
58
  def extract_pdf(path):
59
- loader = PyMuPDFLoader(path)
60
- data = loader.load()
61
- data = [x.page_content for x in data]
62
- content = '\n\n'.join(data)
63
- return content
64
 
65
  def extract_docx(path):
66
  doc = docx.Document(path)
@@ -68,6 +68,7 @@ def extract_docx(path):
68
  for paragraph in doc.paragraphs:
69
  data.append(paragraph.text)
70
  content = '\n\n'.join(data)
 
71
 
72
  def extract_pptx(path):
73
  prs = Presentation(path)
@@ -91,8 +92,8 @@ def mode_load(path):
91
  else:
92
  content = extract_text(path)
93
  choice = "doc"
94
- print(content)
95
- return choice, content
96
  elif file_type in ["png", "jpg", "jpeg", "bmp", "tiff", "webp"]:
97
  content = Image.open(path).convert('RGB')
98
  choice = "image"
 
6
  import os
7
  from threading import Thread
8
 
9
+ import fitz
10
  import docx
11
  from pptx import Presentation
12
 
 
56
  return open(path, 'r').read()
57
 
58
  def extract_pdf(path):
59
+ doc = fitz.open(path)
60
+ text = ""
61
+ for page in doc:
62
+ text += page.get_text()
63
+ return text
64
 
65
  def extract_docx(path):
66
  doc = docx.Document(path)
 
68
  for paragraph in doc.paragraphs:
69
  data.append(paragraph.text)
70
  content = '\n\n'.join(data)
71
+ return content
72
 
73
  def extract_pptx(path):
74
  prs = Presentation(path)
 
92
  else:
93
  content = extract_text(path)
94
  choice = "doc"
95
+ print(content[:100])
96
+ return choice, content[:5000]
97
  elif file_type in ["png", "jpg", "jpeg", "bmp", "tiff", "webp"]:
98
  content = Image.open(path).convert('RGB')
99
  choice = "image"