Spaces:
Runtime error
Runtime error
ShadowDominator
committed on
Commit
·
febd231
1
Parent(s):
83cbf17
Upload 4 files
Browse files
- .gitattributes +1 -0
- 1706.03762.pdf +3 -0
- app.py +108 -0
- packages.txt +1 -0
- requirements.txt +2 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
1706.03762.pdf filter=lfs diff=lfs merge=lfs -text
|
1706.03762.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfaaec89262875f927cf1b38b2da2d775f3309b7bea3537f29b606ca67e79065
|
3 |
+
size 2201700
|
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from pdf2image import convert_from_path,pdfinfo_from_path
|
4 |
+
import zipfile
|
5 |
+
|
6 |
+
def zip_folder(folder_path, output_path):
    """Compress every file under ``folder_path`` into a zip archive.

    Args:
        folder_path: Directory that is walked recursively with ``os.walk``.
        output_path: Path of the zip file to create. The archive is opened
            in ``'w'`` mode, so an existing file at that path is replaced.

    Each entry is stored under its path relative to ``folder_path`` and
    compressed with ``ZIP_DEFLATED``.
    """
    archive_abspath = os.path.abspath(output_path)
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # If the archive is being written inside folder_path, the
                # walk would otherwise pick up the half-written zip itself.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, folder_path))
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
# Working directories used by the app: DIRECTORY receives the rendered page
# images, DIRECTORY_OUTPUT receives the zip archive offered for download.
DIRECTORY = "image_reference"
DIRECTORY_OUTPUT = "output"
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Check and create directories
for directory in DIRECTORIES:
    # exist_ok=True replaces the separate os.path.exists() check, which left
    # a window for the directory to appear between the check and the create.
    os.makedirs(directory, exist_ok=True)
|
25 |
+
|
26 |
+
|
27 |
+
# File-name suffixes (compared case-insensitively) treated as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']


def get_image_files(directory):
    """Return the paths of the image files directly inside ``directory``.

    Args:
        directory: Folder to scan; sub-folders are not descended into.

    Returns:
        A list of ``os.path.join(directory, name)`` paths for every regular
        file whose lower-cased name ends with one of ``ALLOWED_EXTENSIONS``.
        The order is whatever ``os.listdir`` yields; callers that need a
        specific order must sort the result.
    """
    # str.endswith needs a tuple; build it once instead of per file.
    suffixes = tuple(ALLOWED_EXTENSIONS)
    image_files = []
    for filename in os.listdir(directory):
        if not filename.lower().endswith(suffixes):
            continue
        filepath = os.path.join(directory, filename)
        # A directory named e.g. "scans.png" is not an image.
        if os.path.isfile(filepath):
            image_files.append(filepath)
    return image_files
|
35 |
+
|
36 |
+
|
37 |
+
def clear_directory(directory):
    """Delete everything inside ``directory`` while keeping the directory.

    Files and symlinks are unlinked; sub-directories are removed together
    with their contents. Deletion is best-effort: a failure on one entry is
    printed and the remaining entries are still processed.

    Args:
        directory: Folder to empty. If it does not exist (or is not a
            directory) the function returns without doing anything.
    """
    # Imported locally so this function does not depend on a new
    # module-level import.
    import shutil

    if not os.path.isdir(directory):
        return

    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # os.rmdir only removes empty directories; rmtree also
                # handles sub-directories that still contain files.
                shutil.rmtree(file_path)
        except OSError as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
def extract_photos_from_pdf(file_pdf):
    """Render every page of an uploaded PDF to PNG and zip the results.

    Both working directories are emptied first. Pages are then rendered in
    batches, saved as ``<page number>.png`` inside ``DIRECTORY``, and
    bundled into ``all_photos.zip`` inside ``DIRECTORY_OUTPUT``.

    Args:
        file_pdf: Value of the upload component; its ``.name`` attribute is
            read as the path of the PDF on disk.

    Returns:
        A 2-tuple of component updates ``(gallery_update, file_update)``.
        On success the gallery lists the pages in numeric order and the
        file component points at the zip. On any failure the gallery is
        emptied and labelled "Error", and the file component is hidden.
    """
    clear_directory(DIRECTORY)
    clear_directory(DIRECTORY_OUTPUT)
    try:
        pdf_path = file_pdf.name
        info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)

        total_pages = info["Pages"]  # Total number of pages in the PDF book
        batch_size = 100  # Number of pages to process in each batch

        for start_page in range(0, total_pages, batch_size):
            # pdf2image page numbers are 1-based and last_page is inclusive,
            # so the batch covering offsets [start_page, end) is pages
            # start_page + 1 .. end. Passing start_page itself re-rendered
            # the previous batch's last page and shifted every file name
            # after the first batch by one.
            first_page = start_page + 1
            last_page = min(start_page + batch_size, total_pages)
            images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)
            for page_number, image in enumerate(images, start=first_page):
                image.save(f'{DIRECTORY}/{page_number}.png', 'PNG')

        images_pdf_list = get_image_files(DIRECTORY)
        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        # File names are "<page number>.png"; sort numerically, not lexically.
        sorted_names = sorted(image_names, key=lambda x: int(x[1].split('.')[0]))
        zip_folder(DIRECTORY, f'{DIRECTORY_OUTPUT}/all_photos.zip')
        return (
            gr.Gallery.update(value=sorted_names, label=f"Detected {len(images_pdf_list)} Page{'' if len(images_pdf_list) == 1 else 's'}", show_label=True, visible=True),
            gr.File.update(value=f'{DIRECTORY_OUTPUT}/all_photos.zip', visible=True)
        )
    except Exception as e:
        # UI boundary: report the failure in the interface instead of
        # raising, but print the reason so it is not lost. A bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
        print(f"Failed to extract pages from PDF. Reason: {e}")
        return (
            gr.Gallery.update(value=[], label="Error", show_label=True, visible=True),
            gr.File.update(visible=False)
        )
|
79 |
+
|
80 |
+
# Gradio UI: upload a PDF, press the button, then browse the rendered pages
# and download them as a zip.
# NOTE(review): the original indentation was lost, so the nesting of the
# layout containers below is a reconstruction — confirm against the
# intended layout (in particular whether the lower Tabs sits inside the Row).
with gr.Blocks() as demo:
    with gr.Tabs() as tabs:

        with gr.TabItem("PDF", id=0):

            with gr.Row():
                with gr.Column():
                    # Hidden text box; not wired to any event in this file.
                    proegres = gr.Text(show_label=False, value="", visible=False)
                    file_pdf = gr.File(file_types=['.pdf'], label="Upload PDF *")
                    btn = gr.Button("Extract Photos from PDF")

            with gr.Tabs(visible=True) as tabs_under:

                with gr.TabItem("Photos", id=0):

                    with gr.Column():

                        list_image = gr.Gallery(value=[], label="0 Page", visible=True, show_label=True, elem_id="gallery").style(columns=[3], object_fit="cover", height="auto")
                        file_download = gr.File(file_types=['.zip'], label="Download File", visible=False)

            # One value per example row, matching the single input component.
            examples = gr.Examples([["./1706.03762.pdf"]], fn=extract_photos_from_pdf, inputs=[file_pdf], outputs=[list_image, file_download], cache_examples=False)
            btn.click(fn=extract_photos_from_pdf, inputs=[file_pdf], outputs=[list_image, file_download])


# queue() enables request queuing before the server is started.
demo.queue().launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
poppler-utils
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
gradio==3.32.0
|
2 |
+
pdf2image==1.16.3
|