Dionyssos committed
Commit a84b206 · 1 Parent(s): 560f712

soundscape: discard last 1s from AudioGen - avoids splash sound

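The "last 1s" in the title corresponds to the new `[:-25000]` slice in api.py: after resampling the AudioGen output to 24 kHz, dropping 25000 samples removes roughly the final second of the clip, where the splash / polarity artefact sits. A quick check of that arithmetic (plain Python, numbers taken from the diff below):

```python
# 25000 samples at the 24 kHz StyleTTS rate ≈ the "last 1s" named in the commit title
trim_samples = 25000               # from `target_rate=24000)[0, :-25000]` in api.py
target_rate = 24000                # resample target (AudioGen natively outputs 16 kHz)
print(trim_samples / target_rate)  # -> 1.0416... seconds discarded from the clip tail
```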
api.py CHANGED
@@ -20,7 +20,7 @@ from audiocraft.builders import AudioGen
  CACHE_DIR = 'flask_cache/'
  NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same soundscape for long video)
 
- sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
+ sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
 
 
  Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
@@ -87,11 +87,11 @@ def overlay(x, soundscape=None):
      if soundscape is not None:
 
          # SOUNDS
-         print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
+
          background = sound_generator.generate(
              [soundscape] * NUM_SOUND_GENERATIONS
-             ).reshape(-1).detach().cpu().numpy()  # bs, 11400
-
+             ).reshape(-1).detach().cpu().numpy()  # bs, 11400 @ .74s
+         # sound_generator._flush()  # already done in lm.generate(); EnCodec does not seem to have a transformer, thus no KV cache to clean up from the previous soundscape
          # upsample 16 kHz AudioGen to 24kHZ StyleTTS
 
          print('Resampling')
@@ -100,20 +100,48 @@ def overlay(x, soundscape=None):
          background = audresample.resample(
              background,
              original_rate=16000,  # sound_generator.sample_rate,
-             target_rate=24000)[0, :]
+             target_rate=24000)[0, :-25000]  # discard last samples as they have the splash sound / polarity change
 
          # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
 
-         # replicate audiogen to match TTS
-         n_repeat = len(x) // background.shape[0] + 2
 
-         # Reach the full length of TTS by cloning
-         print(f'Additional Repeat {n_repeat=}')
-         background = np.concatenate(n_repeat * [background])
+
+         k = background.shape[0]
+
+         hop = int(.7 * k)  # only overlap 30%
+         n_repeat = len(x) // hop
+         total = np.zeros(hop * (n_repeat + 2))  # some extra pad space so the last frame fits
+
+         m = np.ones(k)
+         overlap = k - hop
+         m[hop:] = np.linspace(1, 0, overlap)  # falling mask to cross-fade the overlapped hop
+         # m[:overlap] = np.linspace(0, 1, overlap)
+
+         for j in range(n_repeat):
+             # total[j*k + hop:(j+1)*k + hop] += background
+             # total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m * background
+             # total[j * (k+hop):(j+1) * k + j*hop] = background
+             total[j*hop:j*hop + k] += m * background  # the running total already falls smoothly due to the previous mask; only the newly added signal needs to rise smoothly
+         # total = total.clip(-1, 1)  # if too many signals were added on top of each other
+         # print(total[40000:70000].tolist())
+         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
+
+         # background = np.concatenate(n_repeat * [background])
+
          # background = _shift(background)
-         print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
-               f'{np.abs(background.max())=}\n{x.shape=}')
-         x = .6 * x + .4 * background[:len(x)]
+         # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
+         #       f'{np.abs(background.max())=}\n{x.shape=}')
+         total /= np.abs(total).max() + 1e-7  # normalise the tiled background to full [-1, 1]
+         x = .4 * x + .6 * total[:len(x)]
+
      else:
          print('sound_background = None')
      return x
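The new overlay() body tiles the trimmed AudioGen clip across the full TTS length with a 70% hop and a linear fade-out over the 30% overlap, then normalises the result and mixes it under the speech. A minimal, self-contained sketch of that overlap-add logic follows; the function name `extend_background` and the placeholder signals are illustrative assumptions, not part of the repository.

```python
# Minimal sketch of the overlap-add tiling done in overlay() above, assuming a
# 1-D float numpy clip; `extend_background` and the placeholder arrays are
# hypothetical, not the repository API.
import numpy as np

def extend_background(background, target_len, overlap_ratio=0.3):
    k = background.shape[0]
    hop = int((1 - overlap_ratio) * k)         # advance 70% of the clip per repeat
    n_repeat = target_len // hop
    total = np.zeros(hop * (n_repeat + 2))     # extra pad so the last copy fits

    mask = np.ones(k)
    mask[hop:] = np.linspace(1, 0, k - hop)    # linear fade-out over the overlapped tail

    for j in range(n_repeat):
        # each copy fades out where the next copy enters at full level
        total[j * hop:j * hop + k] += mask * background

    total /= np.abs(total).max() + 1e-7        # normalise the tiled background
    return total[:target_len]

# usage mirroring the new mix in overlay(): 40% speech, 60% looped background
speech = np.zeros(24000 * 10)                   # placeholder: 10 s of 24 kHz TTS audio
clip = np.sin(np.linspace(0, 200, 24000 * 3))   # placeholder: ~3 s background clip
mix = .4 * speech + .6 * extend_background(clip, len(speech))
```

Note that only the fade-out half of the cross-fade is applied (the rising mask stays commented out in the commit), so the overlapped region can briefly sum above the clip's own level; the final normalisation by `np.abs(total).max()` keeps the background inside [-1, 1] before the .4/.6 mix.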
audiocraft/builders.py CHANGED
@@ -252,4 +252,8 @@ class AudioGen(nn.Module):
          model.load_state_dict(pkg['best_state'])
          model.cfg = cfg
          # return model
-         self.lm = model.to(torch.float)
+         self.lm = model.to(torch.float)
+
+     # def _flush(self):
+     #     self.lm._flush()  # already done in lm generate at end
+
audiocraft/lm.py CHANGED
@@ -164,7 +164,7 @@ class LMModel(nn.Module):
          self.cfg_coef = cfg_coef
          self.condition_provider = condition_provider
          self.card = card  # 2048 ?
-         self.n_draw = 2  # replicate so many times the generation of each text in batch
+         self.n_draw = 1  # replicate so many times the generation of each text in batch
          embed_dim = self.card + 1
          self.n_q = n_q
          self.dim = dim
audiocraft/transformer.py CHANGED
@@ -175,7 +175,7 @@ class StreamingMultiheadAttention(nn.Module):
              v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
 
              q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
-             print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(), 'CROSS A5')
+             # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(), 'CROSS A5')
          else:
              # 1st projected makes k,v (instantaneous)
              # 2nd cat
@@ -213,7 +213,7 @@ class StreamingMultiheadAttention(nn.Module):
 
 
          # KV COMPLETION ONLY ON SELF ATTENTION
-         print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
+         # print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
 
 
          if self.memory_efficient:
@@ -386,7 +386,7 @@ class StreamingTransformer(nn.Module):
 
 
          for j, lay in enumerate(self.layers):
-             print(f'5_________________________{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
-             x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
+             # print(f'Transf Layer{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
+             x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # cross_attention_src = txt-cond
          # each layer (mha) keeps history of its own k,v for all tokens
          return x