Add variable audio lengths

#3
by ylacombe - opened
Files changed (1): app.py (+6 -4)
app.py CHANGED
@@ -30,7 +30,7 @@ model = ParlerTTSForConditionalGeneration.from_pretrained(
 client = InferenceClient()
 
 description_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-prompt_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+prompt_tokenizer = AutoTokenizer.from_pretrained(repo_id, padding_side="left")
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 
 SAMPLE_RATE = feature_extractor.sampling_rate
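The padding_side="left" change matters once prompts of different lengths are batched: left padding puts the pad tokens in front, so each prompt's last real token sits directly before the position where generation continues, and the accompanying attention mask marks the padded positions. A minimal sketch of the effect, using the gpt2 tokenizer purely as a stand-in for the Parler-TTS prompt tokenizer:

```python
from transformers import AutoTokenizer

# Illustrative stand-in; the PR itself loads AutoTokenizer.from_pretrained(repo_id, padding_side="left")
tok = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tok.pad_token = tok.eos_token  # gpt2 ships without a pad token

batch = tok(
    ["A short prompt", "A much longer prompt with several extra words"],
    padding=True,
    return_tensors="pt",
)

print(batch.input_ids)       # pad ids appear at the front of the shorter row
print(batch.attention_mask)  # 0s mark the padded positions to be ignored
```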
@@ -78,7 +78,7 @@ def generate_story(subject: str, setting: str) -> str:
     return None, None, story
 
 
-@spaces.GPU
+@spaces.GPU(duration=120)
 def generate_base(story):
 
 
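On ZeroGPU Spaces the duration argument sets how much GPU time a single call may request; the bare @spaces.GPU form falls back to the default allocation, and raising it to 120 seconds presumably leaves room to synthesize a full story. A sketch of the pattern (the function body here is a placeholder, not the app's actual code):

```python
import spaces

@spaces.GPU(duration=120)  # request up to ~120 s of ZeroGPU time per call
def generate_base(story):
    # placeholder body: tokenize the story, run model.generate on the GPU,
    # and return the synthesized audio
    ...
```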
 
@@ -95,8 +95,10 @@ def generate_base(story):
     speech_output = model.generate(input_ids=description_tokens.input_ids,
                                    prompt_input_ids=story_tokens.input_ids,
                                    attention_mask=description_tokens.attention_mask,
-                                   prompt_attention_mask=story_tokens.attention_mask)
-    speech_output = [output.cpu().numpy() for output in speech_output]
+                                   prompt_attention_mask=story_tokens.attention_mask,
+                                   return_dict_in_generate=True,
+                                   )
+    speech_output = [output.cpu().numpy()[:output_length] for (output, output_length) in zip(speech_output.sequences, speech_output.audios_length)]
     return None, None, speech_output
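Passing return_dict_in_generate=True makes generate return a structured output rather than a bare batch of waveforms, which is what exposes the sequences and audios_length fields used above: each generated waveform is cut back to its true number of samples instead of the padded generation length, which is what makes the audio length variable (the point of this PR). A small sketch of that trimming step on dummy NumPy arrays (the names mirror the fields in the diff; the data is made up):

```python
import numpy as np

# Stand-ins for speech_output.sequences and speech_output.audios_length:
# two waveforms padded to the same length, plus their real sample counts.
sequences = [np.random.randn(44_100), np.random.randn(44_100)]
audios_length = [30_000, 44_100]

trimmed = [seq[:length] for seq, length in zip(sequences, audios_length)]
print([t.shape for t in trimmed])  # [(30000,), (44100,)]
```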
 
 