Bug: NumPy 2.0 breaks nvidia/parakeet-tdt-0.6b-v2
Hey developers! I really need your help with this fix - it's super important for my project!
I want to use nvidia/parakeet-tdt-0.6b-v2 to replace WhisperX in my Colab notebook (GitHub: discord-transcription-bot), but I'm hitting this annoying error:
AttributeError: `np.sctypes` was removed in the NumPy 2.0 release. Access dtypes explicitly instead.
My Setup
- Google Colab
What I Did
I followed your exact instructions, i.e.:
pip install -U nemo_toolkit['asr']
- install
- restart session
- Run
import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")
!wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
output = asr_model.transcribe(['2086-149220-0033.wav'])
BUG/ERROR:
The following is the full error. It fails about 9 out of 10 times; the one run that did not fail happened yesterday, and I cannot reproduce it.
Transcribing: 0%| | 0/1 [00:00<?, ?it/s]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-72ca978fb8dc> in <cell line: 0>()
----> 1 output = asr_model.transcribe(['2086-149220-0033.wav'])
2 # print(output[0].text)
16 frames
/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
117
118 return decorate_context
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/models/rnnt_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, timestamps, override_config)
305 self.change_decoding_strategy(self.cfg.decoding, verbose=False)
306
--> 307 return super().transcribe(
308 audio=audio,
309 batch_size=batch_size,
/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
117
118 return decorate_context
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe(self, audio, batch_size, return_hypotheses, num_workers, channel_selector, augmentor, verbose, timestamps, override_config, **config_kwargs)
267 generator = self.transcribe_generator(audio, override_config=transcribe_cfg)
268
--> 269 for processed_outputs in generator:
270 # Store results
271 if isinstance(processed_outputs, list):
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe_generator(self, audio, override_config)
373 verbose = True
374
--> 375 for test_batch in tqdm(dataloader, desc="Transcribing", disable=not verbose):
376 # Move batch to device
377 test_batch = move_data_to_device(test_batch, transcribe_cfg._internal.device)
/usr/local/lib/python3.11/dist-packages/tqdm/std.py in __iter__(self)
1179
1180 try:
-> 1181 for obj in iterable:
1182 yield obj
1183 # Update and possibly print the progressbar.
/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py in __next__(self)
706 # TODO(https://github.com/pytorch/pytorch/issues/76750)
707 self._reset() # type: ignore[call-arg]
--> 708 data = self._next_data()
709 self._num_yielded += 1
710 if (
/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
762 def _next_data(self):
763 index = self._next_index() # may raise StopIteration
--> 764 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
765 if self._pin_memory:
766 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
50 data = self.dataset.__getitems__(possibly_batched_index)
51 else:
---> 52 data = [self.dataset[idx] for idx in possibly_batched_index]
53 else:
54 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/data/audio_to_text.py in __getitem__(self, index)
488 return [self._process_sample(_index) for _index in index]
489 else:
--> 490 return self._process_sample(index)
491
492 def _process_sample(self, index):
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/data/audio_to_text.py in _process_sample(self, index)
497 offset = 0
498
--> 499 features = self.featurizer.process(
500 sample.audio_file,
501 offset=offset,
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/features.py in process(self, file_path, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db)
200 normalize_db=None,
201 ):
--> 202 audio = AudioSegment.from_file(
203 file_path,
204 target_sr=self.sample_rate,
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in from_file(cls, audio_file, target_sr, int_values, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel)
373 raise Exception(f"Your audio file {audio_file} could not be decoded. We tried using {libs}.")
374
--> 375 return cls(
376 samples,
377 sample_rate,
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in __init__(self, samples, sample_rate, target_sr, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel, audio_file, offset, duration)
179 Samples are convert float32 internally, with int scaled to [-1, 1].
180 """
--> 181 samples = self._convert_samples_to_float32(samples)
182
183 # Check if channel selector is necessary
/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in _convert_samples_to_float32(samples)
261 """
262 float32_samples = samples.astype('float32')
--> 263 if samples.dtype in np.sctypes['int']:
264 bits = np.iinfo(samples.dtype).bits
265 float32_samples *= 1.0 / 2 ** (bits - 1)
/usr/local/lib/python3.11/dist-packages/numpy/__init__.py in __getattr__(attr)
395
396 if attr in __expired_attributes__:
--> 397 raise AttributeError(
398 f"`np.{attr}` was removed in the NumPy 2.0 release. "
399 f"{__expired_attributes__[attr]}"
AttributeError: `np.sctypes` was removed in the NumPy 2.0 release. Access dtypes explicitly instead.
This happens when NumPy >= 2.0 was already installed in your environment.
For now, downgrade to numpy<2: pip install "numpy<2"
In the next release we will upgrade all NeMo collections to support NumPy >= 2.0.
Yes, I was hesitant because it could interfere with my other code.
THANKS, you're awesome! Keep it up!
Solution: !pip install "numpy<2.0.0"
Thanks, please share what you built later!
🎉 Just finished a Discord Transcription Bot using parakeet-tdt-0.6b-v2!
🧑‍💻 GitHub: discord-transcription-bot
🎥 Tutorial: YouTube video
💬 Background: LinkedIn post
Thanks for the awesome model — worked great out of the box!