Update app.py
Browse files
app.py
CHANGED
@@ -87,26 +87,25 @@ def generate_response(messages, model, tokenizer):
|
|
87 |
add_generation_prompt=True,
|
88 |
)
|
89 |
|
90 |
-
# Tokenizer and model input preparation
|
91 |
model_inputs = tokenizer([text], return_tensors="pt").to(device)
|
92 |
|
93 |
-
#
|
|
|
|
|
|
|
94 |
with torch.no_grad():
|
95 |
-
# Ensure full precision by disabling autocast if necessary
|
96 |
-
# Assuming infer_process handles precision internally
|
97 |
generated_ids = model.generate(
|
98 |
input_ids=model_inputs.input_ids,
|
99 |
-
max_new_tokens=1024,
|
100 |
temperature=0.5,
|
101 |
top_p=0.9,
|
102 |
-
do_sample=True,
|
103 |
-
repetition_penalty=1.2,
|
104 |
)
|
105 |
|
106 |
if not generated_ids:
|
107 |
raise ValueError("No generated IDs returned by the model.")
|
108 |
|
109 |
-
# Post-processing the generated IDs
|
110 |
generated_ids = [
|
111 |
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
112 |
]
|
@@ -114,7 +113,6 @@ def generate_response(messages, model, tokenizer):
|
|
114 |
if not generated_ids or not generated_ids[0]:
|
115 |
raise ValueError("Generated IDs are empty after processing.")
|
116 |
|
117 |
-
# Decode and return the response
|
118 |
return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
119 |
|
120 |
def extract_metadata_and_cover(ebook_path):
|
@@ -135,26 +133,24 @@ def embed_cover_into_mp3(mp3_path, cover_image_path):
|
|
135 |
except error:
|
136 |
audio = ID3()
|
137 |
|
138 |
-
# Remove existing APIC frames to avoid duplicates
|
139 |
audio.delall("APIC")
|
140 |
|
141 |
try:
|
142 |
with open(cover_image_path, 'rb') as img:
|
143 |
audio.add(APIC(
|
144 |
-
encoding=3,
|
145 |
-
mime='image/jpeg',
|
146 |
-
type=3,
|
147 |
-
desc='Front cover',
|
148 |
data=img.read()
|
149 |
))
|
150 |
-
# Save with ID3v2.3 for better compatibility
|
151 |
audio.save(mp3_path, v2_version=3)
|
152 |
print(f"Embedded cover image into {mp3_path}")
|
153 |
except Exception as e:
|
154 |
print(f"Failed to embed cover image into MP3: {e}")
|
155 |
|
156 |
def extract_text_and_title_from_epub(epub_path):
|
157 |
-
"""Extract text and title from an EPUB file."""
|
158 |
try:
|
159 |
book = epub.read_epub(epub_path)
|
160 |
print(f"EPUB '{epub_path}' successfully read.")
|
@@ -176,15 +172,15 @@ def extract_text_and_title_from_epub(epub_path):
|
|
176 |
title = os.path.splitext(os.path.basename(epub_path))[0]
|
177 |
print(f"Using filename as title: {title}")
|
178 |
|
179 |
-
|
180 |
-
|
|
|
|
|
181 |
try:
|
182 |
soup = BeautifulSoup(item.get_content(), 'html.parser')
|
183 |
text = soup.get_text(separator=' ', strip=True)
|
184 |
if text:
|
185 |
text_content.append(text)
|
186 |
-
else:
|
187 |
-
print(f"No text in document item {item.get_id()}.")
|
188 |
except Exception as e:
|
189 |
print(f"Error parsing document item {item.get_id()}: {e}")
|
190 |
|
@@ -242,7 +238,7 @@ def show_converted_audiobooks():
|
|
242 |
|
243 |
@gpu_decorator
|
244 |
def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1, show_info=gr.Info, progress=gr.Progress()):
|
245 |
-
"""Perform inference to generate audio from text."""
|
246 |
try:
|
247 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
248 |
except Exception as e:
|
@@ -252,7 +248,6 @@ def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1,
|
|
252 |
raise ValueError("Generated text is empty. Please provide valid text content.")
|
253 |
|
254 |
try:
|
255 |
-
# Ensure inference is on the correct device
|
256 |
with torch.no_grad():
|
257 |
final_wave, final_sample_rate, _ = infer_process(
|
258 |
ref_audio,
|
@@ -263,16 +258,19 @@ def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1,
|
|
263 |
cross_fade_duration=cross_fade_duration,
|
264 |
speed=speed,
|
265 |
show_info=show_info,
|
266 |
-
progress=progress,
|
267 |
)
|
268 |
except Exception as e:
|
269 |
raise RuntimeError(f"Error during inference process: {e}")
|
270 |
|
|
|
|
|
|
|
271 |
return (final_sample_rate, final_wave), ref_text
|
272 |
|
273 |
@gpu_decorator
|
274 |
def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_duration, speed, progress=gr.Progress()):
|
275 |
-
"""Main function to convert eBooks to audiobooks."""
|
276 |
try:
|
277 |
processed_audiobooks = []
|
278 |
num_ebooks = len(gen_file_input)
|
@@ -302,14 +300,13 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
|
|
302 |
gen_text,
|
303 |
cross_fade_duration,
|
304 |
speed,
|
305 |
-
progress=progress,
|
306 |
)
|
307 |
|
308 |
progress(0.8, desc="Stitching audio files")
|
309 |
sample_rate, wave = audio_out
|
310 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
|
311 |
-
|
312 |
-
sf.write(tmp_wav.name, wave, sample_rate, subtype='PCM_24')
|
313 |
tmp_wav_path = tmp_wav.name
|
314 |
|
315 |
progress(0.9, desc="Converting to MP3")
|
@@ -317,21 +314,17 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
|
|
317 |
tmp_mp3_path = os.path.join("Working_files", "Book", f"{sanitized_title}.mp3")
|
318 |
ensure_directory(os.path.dirname(tmp_mp3_path))
|
319 |
|
320 |
-
# Load WAV with Pydub
|
321 |
audio = AudioSegment.from_wav(tmp_wav_path)
|
322 |
-
|
323 |
-
# Export to MP3 with higher bitrate and quality settings
|
324 |
audio.export(
|
325 |
tmp_mp3_path,
|
326 |
format="mp3",
|
327 |
bitrate="320k",
|
328 |
-
parameters=["-q:a", "0"]
|
329 |
)
|
330 |
|
331 |
if cover_image:
|
332 |
embed_cover_into_mp3(tmp_mp3_path, cover_image)
|
333 |
|
334 |
-
# Clean up temporary files
|
335 |
os.remove(tmp_wav_path)
|
336 |
if cover_image and os.path.exists(cover_image):
|
337 |
os.remove(cover_image)
|
@@ -339,9 +332,7 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
|
|
339 |
processed_audiobooks.append(tmp_mp3_path)
|
340 |
progress(1, desc=f"Completed processing ebook {idx+1}/{num_ebooks}")
|
341 |
|
342 |
-
|
343 |
-
player_audio = tmp_mp3_path # Path to the latest audio file
|
344 |
-
yield player_audio, processed_audiobooks # Yield the updated outputs
|
345 |
|
346 |
except Exception as e:
|
347 |
print(f"An error occurred: {e}")
|
@@ -364,7 +355,6 @@ def create_gradio_app():
|
|
364 |
file_count="multiple",
|
365 |
)
|
366 |
|
367 |
-
# Arrange the two buttons side by side using gr.Row
|
368 |
with gr.Row():
|
369 |
generate_btn = gr.Button("Start", variant="primary")
|
370 |
show_audiobooks_btn = gr.Button("Show All Completed Audiobooks", variant="secondary")
|
@@ -402,7 +392,7 @@ def create_gradio_app():
|
|
402 |
speed_slider,
|
403 |
],
|
404 |
outputs=[player, audiobooks_output],
|
405 |
-
show_progress=True,
|
406 |
)
|
407 |
|
408 |
show_audiobooks_btn.click(
|
|
|
87 |
add_generation_prompt=True,
|
88 |
)
|
89 |
|
|
|
90 |
model_inputs = tokenizer([text], return_tensors="pt").to(device)
|
91 |
|
92 |
+
# Increase max_new_tokens to a much larger number to avoid truncation
|
93 |
+
# Previously: max_new_tokens=1024
|
94 |
+
max_new_tokens = 1000000 # Large number to allow full generation
|
95 |
+
|
96 |
with torch.no_grad():
|
|
|
|
|
97 |
generated_ids = model.generate(
|
98 |
input_ids=model_inputs.input_ids,
|
99 |
+
max_new_tokens=max_new_tokens,
|
100 |
temperature=0.5,
|
101 |
top_p=0.9,
|
102 |
+
do_sample=True,
|
103 |
+
repetition_penalty=1.2,
|
104 |
)
|
105 |
|
106 |
if not generated_ids:
|
107 |
raise ValueError("No generated IDs returned by the model.")
|
108 |
|
|
|
109 |
generated_ids = [
|
110 |
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
111 |
]
|
|
|
113 |
if not generated_ids or not generated_ids[0]:
|
114 |
raise ValueError("Generated IDs are empty after processing.")
|
115 |
|
|
|
116 |
return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
117 |
|
118 |
def extract_metadata_and_cover(ebook_path):
|
|
|
133 |
except error:
|
134 |
audio = ID3()
|
135 |
|
|
|
136 |
audio.delall("APIC")
|
137 |
|
138 |
try:
|
139 |
with open(cover_image_path, 'rb') as img:
|
140 |
audio.add(APIC(
|
141 |
+
encoding=3,
|
142 |
+
mime='image/jpeg',
|
143 |
+
type=3,
|
144 |
+
desc='Front cover',
|
145 |
data=img.read()
|
146 |
))
|
|
|
147 |
audio.save(mp3_path, v2_version=3)
|
148 |
print(f"Embedded cover image into {mp3_path}")
|
149 |
except Exception as e:
|
150 |
print(f"Failed to embed cover image into MP3: {e}")
|
151 |
|
152 |
def extract_text_and_title_from_epub(epub_path):
|
153 |
+
"""Extract full text and title from an EPUB file in reading order."""
|
154 |
try:
|
155 |
book = epub.read_epub(epub_path)
|
156 |
print(f"EPUB '{epub_path}' successfully read.")
|
|
|
172 |
title = os.path.splitext(os.path.basename(epub_path))[0]
|
173 |
print(f"Using filename as title: {title}")
|
174 |
|
175 |
+
# Iterate over the book's spine in reading order
|
176 |
+
for spine_item in book.spine:
|
177 |
+
item = book.get_item_with_id(spine_item[0])
|
178 |
+
if item and item.get_type() == ITEM_DOCUMENT:
|
179 |
try:
|
180 |
soup = BeautifulSoup(item.get_content(), 'html.parser')
|
181 |
text = soup.get_text(separator=' ', strip=True)
|
182 |
if text:
|
183 |
text_content.append(text)
|
|
|
|
|
184 |
except Exception as e:
|
185 |
print(f"Error parsing document item {item.get_id()}: {e}")
|
186 |
|
|
|
238 |
|
239 |
@gpu_decorator
|
240 |
def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1, show_info=gr.Info, progress=gr.Progress()):
|
241 |
+
"""Perform inference to generate audio from text without truncation."""
|
242 |
try:
|
243 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
244 |
except Exception as e:
|
|
|
248 |
raise ValueError("Generated text is empty. Please provide valid text content.")
|
249 |
|
250 |
try:
|
|
|
251 |
with torch.no_grad():
|
252 |
final_wave, final_sample_rate, _ = infer_process(
|
253 |
ref_audio,
|
|
|
258 |
cross_fade_duration=cross_fade_duration,
|
259 |
speed=speed,
|
260 |
show_info=show_info,
|
261 |
+
progress=progress,
|
262 |
)
|
263 |
except Exception as e:
|
264 |
raise RuntimeError(f"Error during inference process: {e}")
|
265 |
|
266 |
+
# Log the length of the generated audio for debugging
|
267 |
+
print(f"Generated audio length: {len(final_wave)} samples at {final_sample_rate} Hz.")
|
268 |
+
|
269 |
return (final_sample_rate, final_wave), ref_text
|
270 |
|
271 |
@gpu_decorator
|
272 |
def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_duration, speed, progress=gr.Progress()):
|
273 |
+
"""Main function to convert eBooks to audiobooks with full text processing."""
|
274 |
try:
|
275 |
processed_audiobooks = []
|
276 |
num_ebooks = len(gen_file_input)
|
|
|
300 |
gen_text,
|
301 |
cross_fade_duration,
|
302 |
speed,
|
303 |
+
progress=progress,
|
304 |
)
|
305 |
|
306 |
progress(0.8, desc="Stitching audio files")
|
307 |
sample_rate, wave = audio_out
|
308 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
|
309 |
+
sf.write(tmp_wav.name, wave, sample_rate)
|
|
|
310 |
tmp_wav_path = tmp_wav.name
|
311 |
|
312 |
progress(0.9, desc="Converting to MP3")
|
|
|
314 |
tmp_mp3_path = os.path.join("Working_files", "Book", f"{sanitized_title}.mp3")
|
315 |
ensure_directory(os.path.dirname(tmp_mp3_path))
|
316 |
|
|
|
317 |
audio = AudioSegment.from_wav(tmp_wav_path)
|
|
|
|
|
318 |
audio.export(
|
319 |
tmp_mp3_path,
|
320 |
format="mp3",
|
321 |
bitrate="320k",
|
322 |
+
parameters=["-q:a", "0"]
|
323 |
)
|
324 |
|
325 |
if cover_image:
|
326 |
embed_cover_into_mp3(tmp_mp3_path, cover_image)
|
327 |
|
|
|
328 |
os.remove(tmp_wav_path)
|
329 |
if cover_image and os.path.exists(cover_image):
|
330 |
os.remove(cover_image)
|
|
|
332 |
processed_audiobooks.append(tmp_mp3_path)
|
333 |
progress(1, desc=f"Completed processing ebook {idx+1}/{num_ebooks}")
|
334 |
|
335 |
+
yield tmp_mp3_path, processed_audiobooks
|
|
|
|
|
336 |
|
337 |
except Exception as e:
|
338 |
print(f"An error occurred: {e}")
|
|
|
355 |
file_count="multiple",
|
356 |
)
|
357 |
|
|
|
358 |
with gr.Row():
|
359 |
generate_btn = gr.Button("Start", variant="primary")
|
360 |
show_audiobooks_btn = gr.Button("Show All Completed Audiobooks", variant="secondary")
|
|
|
392 |
speed_slider,
|
393 |
],
|
394 |
outputs=[player, audiobooks_output],
|
395 |
+
show_progress=True,
|
396 |
)
|
397 |
|
398 |
show_audiobooks_btn.click(
|