jdana committed
Commit 17f027d · verified · 1 Parent(s): 16e76af

Update app.py

Files changed (1)
  1. app.py +27 -37
app.py CHANGED
@@ -87,26 +87,25 @@ def generate_response(messages, model, tokenizer):
         add_generation_prompt=True,
     )
 
-    # Tokenizer and model input preparation
     model_inputs = tokenizer([text], return_tensors="pt").to(device)
 
-    # Use full precision for higher audio quality
+    # Increase max_new_tokens to a much larger number to avoid truncation
+    # Previously: max_new_tokens=2048
+    max_new_tokens = 1000000  # Large number to allow full generation
+
     with torch.no_grad():
-        # Ensure full precision by disabling autocast if necessary
-        # Assuming infer_process handles precision internally
         generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
-            max_new_tokens=2048,
+            max_new_tokens=max_new_tokens,
            temperature=0.5,
            top_p=0.9,
-            do_sample=True,  # Enable sampling for more natural responses
-            repetition_penalty=1.2,  # Prevent repetition
+            do_sample=True,
+            repetition_penalty=1.2,
        )
 
    if not generated_ids:
        raise ValueError("No generated IDs returned by the model.")
 
-    # Post-processing the generated IDs
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
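The raised cap is only an upper bound: generation still normally stops at the model's end-of-sequence token, and the prompt tokens are sliced off before decoding so only the reply is returned. Below is a minimal, self-contained sketch of that same generate-then-trim pattern for a generic Hugging Face causal LM; the model name, helper name, and the 4096-token cap are illustrative placeholders, not values taken from app.py.

```python
# Hypothetical sketch of the pattern used in generate_response(); names are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_reply(messages, model_name="Qwen/Qwen2.5-0.5B-Instruct", max_new_tokens=4096):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Build the chat prompt, exactly as the function above does.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,  # upper bound only; EOS usually ends generation earlier
            do_sample=True,
            temperature=0.5,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    # Keep only the newly generated tokens, dropping the prompt.
    new_tokens = output_ids[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Example: generate_reply([{"role": "user", "content": "Summarize chapter one."}])
```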
@@ -114,7 +113,6 @@ def generate_response(messages, model, tokenizer):
    if not generated_ids or not generated_ids[0]:
        raise ValueError("Generated IDs are empty after processing.")
 
-    # Decode and return the response
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 def extract_metadata_and_cover(ebook_path):
@@ -135,26 +133,24 @@ def embed_cover_into_mp3(mp3_path, cover_image_path):
    except error:
        audio = ID3()
 
-    # Remove existing APIC frames to avoid duplicates
    audio.delall("APIC")
 
    try:
        with open(cover_image_path, 'rb') as img:
            audio.add(APIC(
-                encoding=3,  # 3 is for UTF-8
-                mime='image/jpeg',  # Image MIME type
-                type=3,  # 3 is for front cover
-                desc='Front cover',  # Description
+                encoding=3,
+                mime='image/jpeg',
+                type=3,
+                desc='Front cover',
                data=img.read()
            ))
-        # Save with ID3v2.3 for better compatibility
        audio.save(mp3_path, v2_version=3)
        print(f"Embedded cover image into {mp3_path}")
    except Exception as e:
        print(f"Failed to embed cover image into MP3: {e}")
 
 def extract_text_and_title_from_epub(epub_path):
-    """Extract text and title from an EPUB file."""
+    """Extract full text and title from an EPUB file in reading order."""
    try:
        book = epub.read_epub(epub_path)
        print(f"EPUB '{epub_path}' successfully read.")
@@ -176,15 +172,15 @@ def extract_text_and_title_from_epub(epub_path):
        title = os.path.splitext(os.path.basename(epub_path))[0]
        print(f"Using filename as title: {title}")
 
-    for item in book.get_items():
-        if item.get_type() == ITEM_DOCUMENT:
+    # Iterate over the book's spine in reading order
+    for spine_item in book.spine:
+        item = book.get_item_with_id(spine_item[0])
+        if item and item.get_type() == ITEM_DOCUMENT:
            try:
                soup = BeautifulSoup(item.get_content(), 'html.parser')
                text = soup.get_text(separator=' ', strip=True)
                if text:
                    text_content.append(text)
-                else:
-                    print(f"No text in document item {item.get_id()}.")
            except Exception as e:
                print(f"Error parsing document item {item.get_id()}: {e}")
 
@@ -242,7 +238,7 @@ def show_converted_audiobooks():
 
 @gpu_decorator
 def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1, show_info=gr.Info, progress=gr.Progress()):
-    """Perform inference to generate audio from text."""
+    """Perform inference to generate audio from text without truncation."""
    try:
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
    except Exception as e:
@@ -252,7 +248,6 @@ def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1,
        raise ValueError("Generated text is empty. Please provide valid text content.")
 
    try:
-        # Ensure inference is on the correct device
        with torch.no_grad():
            final_wave, final_sample_rate, _ = infer_process(
                ref_audio,
@@ -263,16 +258,19 @@ def infer(ref_audio_orig, ref_text, gen_text, cross_fade_duration=0.0, speed=1,
                cross_fade_duration=cross_fade_duration,
                speed=speed,
                show_info=show_info,
-                progress=progress,  # Pass progress here
+                progress=progress,
            )
    except Exception as e:
        raise RuntimeError(f"Error during inference process: {e}")
 
+    # Log the length of the generated audio for debugging
+    print(f"Generated audio length: {len(final_wave)} samples at {final_sample_rate} Hz.")
+
    return (final_sample_rate, final_wave), ref_text
 
 @gpu_decorator
 def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_duration, speed, progress=gr.Progress()):
-    """Main function to convert eBooks to audiobooks."""
+    """Main function to convert eBooks to audiobooks with full text processing."""
    try:
        processed_audiobooks = []
        num_ebooks = len(gen_file_input)
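The new debug line reports a raw sample count; a duration in seconds is often easier to eyeball. A hypothetical helper in the same spirit, not part of app.py:

```python
# Hypothetical helper: turn the (sample_rate, wave) pair returned by infer() into a readable summary.
def describe_audio(sample_rate: int, wave) -> str:
    seconds = len(wave) / sample_rate
    return f"{len(wave)} samples ≈ {seconds:.1f} s at {sample_rate} Hz"
```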
@@ -302,14 +300,13 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
                gen_text,
                cross_fade_duration,
                speed,
-                progress=progress,  # Pass progress here
+                progress=progress,
            )
 
            progress(0.8, desc="Stitching audio files")
            sample_rate, wave = audio_out
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
-                # Save WAV with higher bit depth and sample rate if possible
-                sf.write(tmp_wav.name, wave, sample_rate, subtype='PCM_24')
+                sf.write(tmp_wav.name, wave, sample_rate)
                tmp_wav_path = tmp_wav.name
 
            progress(0.9, desc="Converting to MP3")
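Dropping subtype='PCM_24' means soundfile falls back to the default subtype for the container, which for WAV is 16-bit PCM. A quick check of that behaviour, assuming soundfile and NumPy are installed; the file names are placeholders:

```python
# Hypothetical check of soundfile's default WAV subtype versus the explicit PCM_24 that was removed.
import numpy as np
import soundfile as sf

print(sf.default_subtype("WAV"))  # 'PCM_16': the fallback used by the new sf.write() call

wave = np.zeros(24000, dtype=np.float32)  # one second of silence at 24 kHz as a stand-in signal
sf.write("default.wav", wave, 24000)                  # 16-bit PCM
sf.write("pcm24.wav", wave, 24000, subtype="PCM_24")  # what the removed line produced

print(sf.info("default.wav").subtype, sf.info("pcm24.wav").subtype)  # PCM_16 PCM_24
```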
@@ -317,21 +314,17 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
            tmp_mp3_path = os.path.join("Working_files", "Book", f"{sanitized_title}.mp3")
            ensure_directory(os.path.dirname(tmp_mp3_path))
 
-            # Load WAV with Pydub
            audio = AudioSegment.from_wav(tmp_wav_path)
-
-            # Export to MP3 with higher bitrate and quality settings
            audio.export(
                tmp_mp3_path,
                format="mp3",
                bitrate="320k",
-                parameters=["-q:a", "0"]  # Highest quality for VBR
+                parameters=["-q:a", "0"]
            )
 
            if cover_image:
                embed_cover_into_mp3(tmp_mp3_path, cover_image)
 
-            # Clean up temporary files
            os.remove(tmp_wav_path)
            if cover_image and os.path.exists(cover_image):
                os.remove(cover_image)
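Note that bitrate="320k" asks ffmpeg for constant 320 kbps while the retained -q:a 0 flag selects LAME's best variable-bitrate quality; the two describe different encoding modes, so it may be cleaner to pick one. A hypothetical side-by-side with placeholder file names:

```python
# Hypothetical comparison of the two MP3 encoding modes mixed in the export call above.
from pydub import AudioSegment

audio = AudioSegment.from_wav("chapter.wav")  # placeholder input

# Constant bitrate: 320 kbps throughout, predictable file size.
audio.export("chapter_cbr.mp3", format="mp3", bitrate="320k")

# LAME VBR at its highest quality setting; size varies with content.
audio.export("chapter_vbr.mp3", format="mp3", parameters=["-q:a", "0"])
```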
@@ -339,9 +332,7 @@ def basic_tts(ref_audio_input, ref_text_input, gen_file_input, cross_fade_durati
            processed_audiobooks.append(tmp_mp3_path)
            progress(1, desc=f"Completed processing ebook {idx+1}/{num_ebooks}")
 
-            # Yield the outputs after processing each ebook
-            player_audio = tmp_mp3_path  # Path to the latest audio file
-            yield player_audio, processed_audiobooks  # Yield the updated outputs
+            yield tmp_mp3_path, processed_audiobooks
 
    except Exception as e:
        print(f"An error occurred: {e}")
@@ -364,7 +355,6 @@ def create_gradio_app():
            file_count="multiple",
        )
 
-        # Arrange the two buttons side by side using gr.Row
        with gr.Row():
            generate_btn = gr.Button("Start", variant="primary")
            show_audiobooks_btn = gr.Button("Show All Completed Audiobooks", variant="secondary")
@@ -402,7 +392,7 @@ def create_gradio_app():
                speed_slider,
            ],
            outputs=[player, audiobooks_output],
-            show_progress=True,  # Enable progress bar
+            show_progress=True,
        )
 
        show_audiobooks_btn.click(
 