Bug: NumPy 2.0 breaks nvidia/parakeet-tdt-0.6b-v2

#12

by david44099 - opened May 6

May 6

Hey developers! I really need your help with this fix - it's super important for my project!
I want to use nvidia/parakeet-tdt-0.6b-v2 to replace whisperx in my Colab notebook (from the discord-transcription-bot repository on GitHub), but I'm hitting this annoying error:

AttributeError: `np.sctypes` was removed in the NumPy 2.0 release. Access dtypes explicitly instead.

My Setup

Google Colab

What I Did

I followed your exact instructions, i.e.:

pip install -U nemo_toolkit['asr']

Install
Restart session
Run

import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

!wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav

output = asr_model.transcribe(['2086-149220-0033.wav'])

BUG/ERROR:

The following is the full error. It fails 9 out of 10 times, but I cannot reproduce the 10th time, which happened yesterday.

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-4-72ca978fb8dc> in <cell line: 0>()
----> 1 output = asr_model.transcribe(['2086-149220-0033.wav'])
      2 # print(output[0].text)

16 frames

/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    114     def decorate_context(*args, **kwargs):
    115         with ctx_factory():
--> 116             return func(*args, **kwargs)
    117 
    118     return decorate_context

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/models/rnnt_models.py in transcribe(self, audio, batch_size, return_hypotheses, partial_hypothesis, num_workers, channel_selector, augmentor, verbose, timestamps, override_config)
    305                 self.change_decoding_strategy(self.cfg.decoding, verbose=False)
    306 
--> 307         return super().transcribe(
    308             audio=audio,
    309             batch_size=batch_size,

/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    114     def decorate_context(*args, **kwargs):
    115         with ctx_factory():
--> 116             return func(*args, **kwargs)
    117 
    118     return decorate_context

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe(self, audio, batch_size, return_hypotheses, num_workers, channel_selector, augmentor, verbose, timestamps, override_config, **config_kwargs)
    267             generator = self.transcribe_generator(audio, override_config=transcribe_cfg)
    268 
--> 269             for processed_outputs in generator:
    270                 # Store results
    271                 if isinstance(processed_outputs, list):

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/mixins/transcription.py in transcribe_generator(self, audio, override_config)
    373                     verbose = True
    374 
--> 375                 for test_batch in tqdm(dataloader, desc="Transcribing", disable=not verbose):
    376                     # Move batch to device
    377                     test_batch = move_data_to_device(test_batch, transcribe_cfg._internal.device)

/usr/local/lib/python3.11/dist-packages/tqdm/std.py in __iter__(self)
   1179 
   1180         try:
-> 1181             for obj in iterable:
   1182                 yield obj
   1183                 # Update and possibly print the progressbar.

/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    706                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    707                 self._reset()  # type: ignore[call-arg]
--> 708             data = self._next_data()
    709             self._num_yielded += 1
    710             if (

/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    762     def _next_data(self):
    763         index = self._next_index()  # may raise StopIteration
--> 764         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    765         if self._pin_memory:
    766             data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     50                 data = self.dataset.__getitems__(possibly_batched_index)
     51             else:
---> 52                 data = [self.dataset[idx] for idx in possibly_batched_index]
     53         else:
     54             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
     50                 data = self.dataset.__getitems__(possibly_batched_index)
     51             else:
---> 52                 data = [self.dataset[idx] for idx in possibly_batched_index]
     53         else:
     54             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/data/audio_to_text.py in __getitem__(self, index)
    488             return [self._process_sample(_index) for _index in index]
    489         else:
--> 490             return self._process_sample(index)
    491 
    492     def _process_sample(self, index):

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/data/audio_to_text.py in _process_sample(self, index)
    497             offset = 0
    498 
--> 499         features = self.featurizer.process(
    500             sample.audio_file,
    501             offset=offset,

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/features.py in process(self, file_path, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db)
    200         normalize_db=None,
    201     ):
--> 202         audio = AudioSegment.from_file(
    203             file_path,
    204             target_sr=self.sample_rate,

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in from_file(cls, audio_file, target_sr, int_values, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel)
    373             raise Exception(f"Your audio file {audio_file} could not be decoded. We tried using {libs}.")
    374 
--> 375         return cls(
    376             samples,
    377             sample_rate,

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in __init__(self, samples, sample_rate, target_sr, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel, audio_file, offset, duration)
    179         Samples are convert float32 internally, with int scaled to [-1, 1].
    180         """
--> 181         samples = self._convert_samples_to_float32(samples)
    182 
    183         # Check if channel selector is necessary

/usr/local/lib/python3.11/dist-packages/nemo/collections/asr/parts/preprocessing/segment.py in _convert_samples_to_float32(samples)
    261         """
    262         float32_samples = samples.astype('float32')
--> 263         if samples.dtype in np.sctypes['int']:
    264             bits = np.iinfo(samples.dtype).bits
    265             float32_samples *= 1.0 / 2 ** (bits - 1)

/usr/local/lib/python3.11/dist-packages/numpy/__init__.py in __getattr__(attr)
    395 
    396         if attr in __expired_attributes__:
--> 397             raise AttributeError(
    398                 f"`np.{attr}` was removed in the NumPy 2.0 release. "
    399                 f"{__expired_attributes__[attr]}"

AttributeError: `np.sctypes` was removed in the NumPy 2.0 release. Access dtypes explicitly instead.