Morgan Funtowicz committed
Commit: 7ff080c
Parent(s): 6eeb890

feat(nemo): initial commit

Files changed:
- Dockerfile +23 -0
- handler.py +104 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,23 @@
+ARG SDK_VERSION=latest
+FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk
+
+FROM nvcr.io/nvidia/nemo:25.04
+RUN --mount=type=bind,from=sdk,source=/opt/hfendpoints/dist,target=/usr/local/endpoints/dist \
+    --mount=type=bind,source=requirements.txt,target=/tmp/requirements.txt \
+    python3 -m pip install -r /tmp/requirements.txt && \
+    python3 -m pip install /usr/local/endpoints/dist/*.whl
+
+
+COPY handler.py /usr/local/endpoint/
+
+# Disable TQDM
+ENV TQDM_DISABLE=1
+
+# Network interface
+ENV INTERFACE=0.0.0.0
+ENV PORT=80
+
+EXPOSE 80
+
+ENTRYPOINT ["python3"]
+CMD ["/usr/local/endpoint/handler.py"]
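The INTERFACE and PORT values set above are read back by `EndpointConfig.from_env()` in handler.py below. As a rough illustration of that contract, here is a hypothetical, minimal stand-in for the SDK's config object, assuming a one-to-one mapping from environment variables to fields (the actual hfendpoints implementation may differ):

import os
from dataclasses import dataclass


@dataclass
class EndpointConfigSketch:
    """Hypothetical stand-in for hfendpoints.EndpointConfig, for illustration only."""
    interface: str
    port: int

    @classmethod
    def from_env(cls) -> "EndpointConfigSketch":
        # Defaults mirror the ENV lines in the Dockerfile above.
        return cls(
            interface=os.environ.get("INTERFACE", "0.0.0.0"),
            port=int(os.environ.get("PORT", "80")),
        )


config = EndpointConfigSketch.from_env()
print(f"Would bind on {config.interface}:{config.port}")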
handler.py
ADDED
@@ -0,0 +1,104 @@
+import asyncio
+import zlib
+from functools import partial
+from io import BytesIO
+
+from hfendpoints.openai import Context, run
+from hfendpoints.openai.audio import AutomaticSpeechRecognitionEndpoint, SegmentBuilder, Segment, \
+    TranscriptionRequest, TranscriptionResponse, TranscriptionResponseKind, VerboseTranscription
+from librosa import load as load_audio, get_duration
+from loguru import logger
+from nemo.collections.asr.models import ASRModel
+
+from hfendpoints import EndpointConfig, Handler, __version__
+
+
+def compression_ratio(text: str) -> float:
+    """
+    :param text: Transcribed text to score.
+    :return: Ratio of the UTF-8 byte length to the zlib-compressed length; higher means more repetitive text.
+    """
+    text_bytes = text.encode("utf-8")
+    return len(text_bytes) / len(zlib.compress(text_bytes))
+
+
+def get_segment(idx: int, segment, tokenizer, request: TranscriptionRequest) -> Segment:
+    return SegmentBuilder() \
+        .id(idx) \
+        .start(segment['start']) \
+        .end(segment['end']) \
+        .text(segment['segment']) \
+        .tokens(tokenizer.text_to_ids(segment['segment'])) \
+        .temperature(request.temperature) \
+        .compression_ratio(compression_ratio(segment['segment'])) \
+        .build()
+
+
+class NemoAsrHandler(Handler):
+    __slots__ = ("_model",)
+
+    def __init__(self, config: EndpointConfig):
+        logger.info(f"Loading model from {config.repository}")
+        self._model = ASRModel.from_pretrained(model_name=str(config.repository)).eval()
+
+    async def __call__(self, request: TranscriptionRequest, ctx: Context) -> TranscriptionResponse:
+        with logger.contextualize(request_id=ctx.request_id):
+            with memoryview(request) as audio:
+                (waveform, sampling) = load_audio(BytesIO(audio), sr=16000, mono=True)
+                logger.debug(
+                    f"Successfully decoded {len(waveform)} samples of PCM audio"
+                )
+
+            # Do we need to compute the timestamps?
+            needs_timestamps = request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
+            transcribe_f = partial(self._model.transcribe, timestamps=needs_timestamps, verbose=False)
+
+            outputs = await asyncio.get_running_loop().run_in_executor(
+                None,
+                transcribe_f,
+                (waveform,)
+            )
+
+            output = outputs[0]
+            text = output.text
+
+            match request.response_kind:
+                case TranscriptionResponseKind.VERBOSE_JSON:
+                    segment_timestamps = output.timestamp['segment']
+                    segments = [
+                        get_segment(idx, stamp, self._model.tokenizer, request)
+                        for (idx, stamp) in enumerate(segment_timestamps)
+                    ]
+
+                    logger.info(f"Segment: {segment_timestamps[0] if segment_timestamps else 'n/a'}")
+
+                    return TranscriptionResponse.verbose(
+                        VerboseTranscription(
+                            text=text,
+                            duration=get_duration(y=waveform, sr=sampling),
+                            language=request.language,
+                            segments=segments,
+                            # word=None
+                        )
+                    )
+                case TranscriptionResponseKind.JSON:
+                    return TranscriptionResponse.json(text)
+
+                case TranscriptionResponseKind.TEXT:
+                    return TranscriptionResponse.text(text)
+
+            # Theoretically, we can't end up here, as Rust validates the enum value beforehand
+            raise RuntimeError(f"unknown response_kind: {request.response_kind}")
+
+
+def entrypoint():
+    config = EndpointConfig.from_env()
+    handler = NemoAsrHandler(config)
+    endpoint = AutomaticSpeechRecognitionEndpoint(handler)
+
+    logger.info(f"[Hugging Face Endpoint v{__version__}] Serving: {config.model_id}")
+    run(endpoint, config.interface, config.port)
+
+
+if __name__ == '__main__':
+    entrypoint()
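A note on `compression_ratio`: this is the same zlib-based repetition heuristic popularized by Whisper's decoder, where an abnormally high ratio flags looping, degenerate output. A quick standalone check of how the metric behaves (plain Python, no endpoint needed):

import zlib


def compression_ratio(text: str) -> float:
    # Identical to the helper in handler.py above.
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))


# Natural speech barely compresses; short inputs can even expand (ratio near or below 1):
print(compression_ratio("The quick brown fox jumps over the lazy dog."))
# A degenerate decoding loop compresses extremely well, so the ratio spikes:
print(compression_ratio("thank you thank you " * 25))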
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+huggingface_hub[hf_xet]
+librosa >= 0.11.0
+nemo_toolkit[asr] >= 2.3.0
+numpy
+tqdm
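Once the container is up, the endpoint should be reachable with any OpenAI-compatible client. A sketch of a smoke test, with the caveat that the route and model identifier below are assumptions (the hfendpoints.openai naming suggests the standard /v1/audio/transcriptions route, but this commit does not show it), and sample.wav is a placeholder file:

from openai import OpenAI

# base_url, api_key, model, and file are all placeholders / assumptions.
client = OpenAI(base_url="http://localhost:80/v1", api_key="unused")

with open("sample.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="nvidia/parakeet-tdt-0.6b-v2",  # hypothetical NeMo ASR repository
        file=audio_file,
        response_format="verbose_json",  # exercises the VERBOSE_JSON branch in handler.py
    )

print(transcription.text)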