Kr08 committed on
Commit
b815c4a
·
verified ·
1 Parent(s): aa348cd

Generate French text for now

Browse files

First transcription commit
To Do:
1. Predict language from files.
2. Add audio player with transcript text time-synchronized to the audio.

Files changed (1) hide show
  1. app.py +37 -7
app.py CHANGED
@@ -1,11 +1,25 @@
1
- import torchaudio as ta
2
  import streamlit as st
 
3
 
4
  from io import BytesIO
5
- from transformers import AutoProcessor, SeamlessM4TModel
 
 
 
 
 
 
 
6
 
7
- processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium", use_fast=False)
8
- model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
 
 
 
 
 
 
9
 
10
  # Title of the app
11
  st.title("Audio Player with Live Transcription")
@@ -30,12 +44,12 @@ submit_button = st.sidebar.button("Submit")
30
  # return f"Could not request results; {e}"
31
 
32
 
33
- if submit_button and uploaded_files:
34
  st.write("Files uploaded successfully!")
35
 
36
  for uploaded_file in uploaded_files:
37
  # Display file name and audio player
38
- print(uploaded_file)
39
  st.write(f"**File name**: {uploaded_file.name}")
40
  st.audio(uploaded_file, format=uploaded_file.type)
41
 
@@ -44,8 +58,24 @@ if submit_button and uploaded_files:
44
 
45
  # Read the uploaded file data
46
  waveform, sampling_rate = ta.load(uploaded_file.getvalue())
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # Run transcription function and display
49
  # import pdb;pdb.set_trace()
50
  # st.write(audio_data.getvalue())
51
-
 
1
+ import torch
2
  import streamlit as st
3
+ import torchaudio as ta
4
 
5
  from io import BytesIO
6
+ from transformers import AutoProcessor, SeamlessM4TModel, WhisperProcessor, WhisperForConditionalGeneration
7
+
8
+ if torch.cuda.is_available():
9
+ device = "cuda:0"
10
+ torch_dtype = torch.float16
11
+ else:
12
+ device = "cpu"
13
+ torch_dtype = torch.float32
14
 
15
+ SAMPLING_RATE=16000
16
+ task = "transcribe"
17
+
18
+ print(f"{device} Active!")
19
+
20
+ # load Whisper model and processor
21
+ processor = WhisperProcessor.from_pretrained("openai/whisper-small")
22
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
23
 
24
  # Title of the app
25
  st.title("Audio Player with Live Transcription")
 
44
  # return f"Could not request results; {e}"
45
 
46
 
47
+ if submit_button and uploaded_files is not None:
48
  st.write("Files uploaded successfully!")
49
 
50
  for uploaded_file in uploaded_files:
51
  # Display file name and audio player
52
+
53
  st.write(f"**File name**: {uploaded_file.name}")
54
  st.audio(uploaded_file, format=uploaded_file.type)
55
 
 
58
 
59
  # Read the uploaded file data
60
  waveform, sampling_rate = ta.load(uploaded_file.getvalue())
61
+ resampled_inp = ta.functional.resample(waveform, orig_freq=sampling_rate, new_freq=SAMPLING_RATE)
62
 
63
+ input_features = processor(resampled_inp[0], sampling_rate=16000, return_tensors='pt').input_features
64
+
65
+
66
+
67
+ ## Here Generate specific language!!!
68
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")
69
+
70
+
71
+ if task == "translate":
72
+ predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
73
+ else:
74
+ predicted_ids = model.generate(input_features)
75
+ # decode token ids to text
76
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
77
+ st.write(transcription)
78
+ # print(waveform, sampling_rate)
79
  # Run transcription function and display
80
  # import pdb;pdb.set_trace()
81
  # st.write(audio_data.getvalue())