Yingxu He
committed on
Upload processor
Browse files- processing_meralion.py +4 -12
- processor_config.json +0 -1
- tokenizer_config.json +1 -1
processing_meralion.py
CHANGED
@@ -48,7 +48,6 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
48 |
tokenizer_class = "GemmaTokenizer"
|
49 |
valid_kwargs = [
|
50 |
"fixed_speech_embeds_length",
|
51 |
-
"speech_signature",
|
52 |
"speech_token_index",
|
53 |
"time_duration_limit",
|
54 |
"do_normalize"
|
@@ -59,13 +58,11 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
59 |
feature_extractor=None,
|
60 |
tokenizer=None,
|
61 |
fixed_speech_embeds_length=100,
|
62 |
-
speech_signature="<SpeechHere>",
|
63 |
speech_token_index=255999,
|
64 |
time_duration_limit=-1,
|
65 |
do_normalize=True
|
66 |
):
|
67 |
self.fixed_speech_embeds_length = fixed_speech_embeds_length
|
68 |
-
self.speech_signature = speech_signature
|
69 |
self.speech_token_index = speech_token_index
|
70 |
self.time_duration_limit = time_duration_limit
|
71 |
self.do_normalize = do_normalize
|
@@ -74,12 +71,12 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
74 |
|
75 |
self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
|
76 |
|
77 |
-
def _process_text(self, text, speech_signature):
|
78 |
target_string = self.speech_token * self.fixed_speech_embeds_length
|
79 |
if isinstance(text, list) or isinstance(text, tuple):
|
80 |
-
pieces = [item.replace(speech_signature, target_string) for item in text]
|
81 |
return pieces
|
82 |
-
return text.replace(speech_signature, target_string)
|
83 |
|
84 |
def _slice_audios(self, audios, time_duration_limit, sampling_rate):
|
85 |
if time_duration_limit <= 0:
|
@@ -101,7 +98,6 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
101 |
audios: Union[np.ndarray, List[np.ndarray]] = None,
|
102 |
padding: Union[bool, str, PaddingStrategy] = True,
|
103 |
sampling_rate: Optional[int] = None,
|
104 |
-
speech_signature: Optional[str] = None,
|
105 |
time_duration_limit: Optional[int] = None,
|
106 |
do_normalize: Optional[bool] = None,
|
107 |
**kwargs,
|
@@ -131,8 +127,6 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
131 |
lengths).
|
132 |
sampling_rate (`int`, defaults to 16000):
|
133 |
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
134 |
-
speech_signature (`str`, defaults to `<SpeechHere>`):
|
135 |
-
The special string marking the location of speech tokens.
|
136 |
time_duration_limit (`int`, defaults -1):
|
137 |
The max input time duration in seconds.
|
138 |
do_normalize (`bool`, defaults to `True`):
|
@@ -144,8 +138,6 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
144 |
raise ValueError("You need to specify either a `text` input to process.")
|
145 |
if sampling_rate is None:
|
146 |
sampling_rate = self.feature_extractor.sampling_rate
|
147 |
-
if speech_signature is None:
|
148 |
-
speech_signature = self.speech_signature
|
149 |
if time_duration_limit is None:
|
150 |
time_duration_limit = self.time_duration_limit
|
151 |
if do_normalize is None:
|
@@ -153,7 +145,7 @@ class MERaLiONProcessor(ProcessorMixin):
|
|
153 |
|
154 |
inputs_dict = {}
|
155 |
|
156 |
-
text = self._process_text(text, speech_signature)
|
157 |
|
158 |
text_input = self.tokenizer(
|
159 |
text=text,
|
|
|
48 |
tokenizer_class = "GemmaTokenizer"
|
49 |
valid_kwargs = [
|
50 |
"fixed_speech_embeds_length",
|
|
|
51 |
"speech_token_index",
|
52 |
"time_duration_limit",
|
53 |
"do_normalize"
|
|
|
58 |
feature_extractor=None,
|
59 |
tokenizer=None,
|
60 |
fixed_speech_embeds_length=100,
|
|
|
61 |
speech_token_index=255999,
|
62 |
time_duration_limit=-1,
|
63 |
do_normalize=True
|
64 |
):
|
65 |
self.fixed_speech_embeds_length = fixed_speech_embeds_length
|
|
|
66 |
self.speech_token_index = speech_token_index
|
67 |
self.time_duration_limit = time_duration_limit
|
68 |
self.do_normalize = do_normalize
|
|
|
71 |
|
72 |
self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
|
73 |
|
74 |
+
def _process_text(self, text):
|
75 |
target_string = self.speech_token * self.fixed_speech_embeds_length
|
76 |
if isinstance(text, list) or isinstance(text, tuple):
|
77 |
+
pieces = [item.replace(self.speech_token, target_string) for item in text]
|
78 |
return pieces
|
79 |
+
return text.replace(self.speech_token, target_string)
|
80 |
|
81 |
def _slice_audios(self, audios, time_duration_limit, sampling_rate):
|
82 |
if time_duration_limit <= 0:
|
|
|
98 |
audios: Union[np.ndarray, List[np.ndarray]] = None,
|
99 |
padding: Union[bool, str, PaddingStrategy] = True,
|
100 |
sampling_rate: Optional[int] = None,
|
|
|
101 |
time_duration_limit: Optional[int] = None,
|
102 |
do_normalize: Optional[bool] = None,
|
103 |
**kwargs,
|
|
|
127 |
lengths).
|
128 |
sampling_rate (`int`, defaults to 16000):
|
129 |
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
|
|
|
|
130 |
time_duration_limit (`int`, defaults -1):
|
131 |
The max input time duration in seconds.
|
132 |
do_normalize (`bool`, defaults to `True`):
|
|
|
138 |
raise ValueError("You need to specify either a `text` input to process.")
|
139 |
if sampling_rate is None:
|
140 |
sampling_rate = self.feature_extractor.sampling_rate
|
|
|
|
|
141 |
if time_duration_limit is None:
|
142 |
time_duration_limit = self.time_duration_limit
|
143 |
if do_normalize is None:
|
|
|
145 |
|
146 |
inputs_dict = {}
|
147 |
|
148 |
+
text = self._process_text(text)
|
149 |
|
150 |
text_input = self.tokenizer(
|
151 |
text=text,
|
processor_config.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"do_normalize": true,
|
6 |
"fixed_speech_embeds_length": 100,
|
7 |
"processor_class": "MERaLiONProcessor",
|
8 |
-
"speech_signature": "<SpeechHere>",
|
9 |
"speech_token_index": 255999,
|
10 |
"time_duration_limit": -1
|
11 |
}
|
|
|
5 |
"do_normalize": true,
|
6 |
"fixed_speech_embeds_length": 100,
|
7 |
"processor_class": "MERaLiONProcessor",
|
|
|
8 |
"speech_token_index": 255999,
|
9 |
"time_duration_limit": -1
|
10 |
}
|
tokenizer_config.json
CHANGED
@@ -1987,7 +1987,7 @@
|
|
1987 |
"special": false
|
1988 |
},
|
1989 |
"255999": {
|
1990 |
-
"content": "<
|
1991 |
"lstrip": false,
|
1992 |
"normalized": false,
|
1993 |
"rstrip": false,
|
|
|
1987 |
"special": false
|
1988 |
},
|
1989 |
"255999": {
|
1990 |
+
"content": "<unused99>",
|
1991 |
"lstrip": false,
|
1992 |
"normalized": false,
|
1993 |
"rstrip": false,
|