Morgan Funtowicz committed on
Commit 7ff080c · 1 Parent(s): 6eeb890

feat(nemo): initial commit

Files changed (3)
  1. Dockerfile +23 -0
  2. handler.py +104 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ ARG SDK_VERSION=latest
+ FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk
+
+ FROM nvcr.io/nvidia/nemo:25.04
+ RUN --mount=type=bind,from=sdk,source=/opt/hfendpoints/dist,target=/usr/local/endpoints/dist \
+     --mount=type=bind,source=requirements.txt,target=/tmp/requirements.txt \
+     python3 -m pip install -r /tmp/requirements.txt && \
+     python3 -m pip install /usr/local/endpoints/dist/*.whl
+
+
+ COPY handler.py /usr/local/endpoint/
+
+ # Disable TQDM progress bars in the logs
+ ENV TQDM_DISABLE=1
+
+ # Network interface and port the endpoint binds to
+ ENV INTERFACE=0.0.0.0
+ ENV PORT=80
+
+ EXPOSE 80
+
+ ENTRYPOINT ["python3"]
+ CMD ["/usr/local/endpoint/handler.py"]
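The only runtime knobs this image exposes are the INTERFACE and PORT environment variables set above. As a rough illustration of that contract, assuming the handler's EndpointConfig.from_env() resolves these names (an assumption based on the variable names; the SDK's actual parsing may differ):

import os

# Hypothetical stand-in for hfendpoints' EndpointConfig.from_env(),
# shown only to illustrate the ENV contract established by this Dockerfile.
interface = os.environ.get("INTERFACE", "0.0.0.0")
port = int(os.environ.get("PORT", "80"))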
handler.py ADDED
@@ -0,0 +1,104 @@
+ import asyncio
+ import zlib
+ from functools import partial
+ from io import BytesIO
+
+ from hfendpoints.openai import Context, run
+ from hfendpoints.openai.audio import AutomaticSpeechRecognitionEndpoint, SegmentBuilder, Segment, \
+     TranscriptionRequest, TranscriptionResponse, TranscriptionResponseKind, VerboseTranscription
+ from librosa import load as load_audio, get_duration
+ from loguru import logger
+ from nemo.collections.asr.models import ASRModel
+
+ from hfendpoints import EndpointConfig, Handler, __version__
+
+
+ def compression_ratio(text: str) -> float:
+     """
+     :param text: transcribed text to score for repetitiveness
+     :return: ratio of the raw UTF-8 size to the zlib-compressed size (higher means more repetitive)
+     """
+     text_bytes = text.encode("utf-8")
+     return len(text_bytes) / len(zlib.compress(text_bytes))
+
+
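For intuition on the heuristic above: repetitive text compresses far better than natural speech, so a degenerate transcription stands out with a markedly higher ratio. A quick self-contained sketch (stdlib only; the sample strings are illustrative):

import zlib

def compression_ratio(text: str) -> float:
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))

print(compression_ratio("the quick brown fox jumps over the lazy dog"))  # around 1 for varied text
print(compression_ratio("yes yes yes " * 50))  # much higher: repetition compresses well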
+ def get_segment(idx: int, segment, tokenizer, request: TranscriptionRequest) -> Segment:
+     return SegmentBuilder() \
+         .id(idx) \
+         .start(segment['start']) \
+         .end(segment['end']) \
+         .text(segment['segment']) \
+         .tokens(tokenizer.text_to_ids(segment['segment'])) \
+         .temperature(request.temperature) \
+         .compression_ratio(compression_ratio(segment['segment'])) \
+         .build()
+
+
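Judging from the keys read above, each NeMo segment-timestamp entry carries 'start', 'end', and 'segment' fields. A minimal sketch of the call shape, using hypothetical stubs for the tokenizer and request so it can run alongside the handler module without a live model (SegmentBuilder still requires the hfendpoints SDK):

from types import SimpleNamespace

# Hypothetical segment entry; the field set is inferred from the keys accessed above.
stamp = {"start": 0.0, "end": 2.4, "segment": "hello world"}

# Stub standing in for NeMo's tokenizer; real token IDs come from the model.
tokenizer = SimpleNamespace(text_to_ids=lambda text: list(text.encode("utf-8")))

# get_segment only reads request.temperature, so a bare namespace suffices here.
request = SimpleNamespace(temperature=0.0)

segment = get_segment(0, stamp, tokenizer, request)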
+ class NemoAsrHandler(Handler):
+     __slots__ = ("_model",)
+
+     def __init__(self, config: EndpointConfig):
+         logger.info(config.repository)
+         self._model = ASRModel.from_pretrained(model_name=str(config.repository)).eval()
+
+     async def __call__(self, request: TranscriptionRequest, ctx: Context) -> TranscriptionResponse:
+         with logger.contextualize(request_id=ctx.request_id):
+             with memoryview(request) as audio:
+                 (waveform, sampling) = load_audio(BytesIO(audio), sr=16000, mono=True)
+                 logger.debug(
+                     f"Successfully decoded {len(waveform)} samples of PCM audio"
+                 )
+
+             # Do we need to compute the timestamps?
+             needs_timestamps = request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
+             transcribe_f = partial(self._model.transcribe, timestamps=needs_timestamps, verbose=False)
+
+             outputs = await asyncio.get_running_loop().run_in_executor(
+                 None,
+                 transcribe_f,
+                 (waveform,)
+             )
+
+             output = outputs[0]
+             text = output.text
+
+             match request.response_kind:
+                 case TranscriptionResponseKind.VERBOSE_JSON:
+                     segment_timestamps = output.timestamp['segment']
+                     segments = [
+                         get_segment(idx, stamp, self._model.tokenizer, request)
+                         for (idx, stamp) in enumerate(segment_timestamps)
+                     ]
+
+                     logger.info(f"Segment: {segment_timestamps[0]}")
+
+                     return TranscriptionResponse.verbose(
+                         VerboseTranscription(
+                             text=text,
+                             duration=get_duration(y=waveform, sr=sampling),
+                             language=request.language,
+                             segments=segments,
+                             # word=None
+                         )
+                     )
+                 case TranscriptionResponseKind.JSON:
+                     return TranscriptionResponse.json(text)
+
+                 case TranscriptionResponseKind.TEXT:
+                     return TranscriptionResponse.text(text)
+
+             # Theoretically, we can't end up here, as Rust validates the enum value beforehand
+             raise RuntimeError(f"unknown response_kind: {request.response_kind}")
+
+
+ def entrypoint():
+     config = EndpointConfig.from_env()
+     handler = NemoAsrHandler(config)
+     endpoint = AutomaticSpeechRecognitionEndpoint(handler)
+
+     logger.info(f"[Hugging Face Endpoint v{__version__}] Serving: {config.model_id}")
+     run(endpoint, config.interface, config.port)
+
+
+ if __name__ == '__main__':
+     entrypoint()
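With the container up, the server should answer OpenAI-style transcription calls. A minimal client sketch, assuming the endpoint mirrors OpenAI's /v1/audio/transcriptions route and multipart field names (an assumption; the hfendpoints SDK defines the exact surface), with a hypothetical sample.wav:

import requests

with open("sample.wav", "rb") as audio_file:
    response = requests.post(
        "http://localhost:80/v1/audio/transcriptions",
        files={"file": audio_file},
        data={"response_format": "verbose_json", "language": "en"},
    )

response.raise_for_status()
print(response.json()["text"])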
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ huggingface_hub[hf_xet]
+ librosa >= 0.11.0
+ nemo_toolkit[asr] >= 2.3.0
+ numpy
+ tqdm