Morgan Funtowicz committed
Commit: 7ff080c
Parent(s): 6eeb890

feat(nemo): initial commit

Files changed:
- Dockerfile +23 -0
- handler.py +104 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,23 @@
+ARG SDK_VERSION=latest
+FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk
+
+FROM nvcr.io/nvidia/nemo:25.04
+RUN --mount=type=bind,from=sdk,source=/opt/hfendpoints/dist,target=/usr/local/endpoints/dist \
+    --mount=type=bind,source=requirements.txt,target=/tmp/requirements.txt \
+    python3 -m pip install -r /tmp/requirements.txt && \
+    python3 -m pip install /usr/local/endpoints/dist/*.whl
+
+
+COPY handler.py /usr/local/endpoint/
+
+# Disable TQDM
+ENV TQDM_DISABLE=1
+
+# Network interface
+ENV INTERFACE=0.0.0.0
+ENV PORT=80
+
+EXPOSE 80
+
+ENTRYPOINT ["python3"]
+CMD ["/usr/local/endpoint/handler.py"]
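The INTERFACE and PORT values set above are read back by `EndpointConfig.from_env()` in handler.py below. As a rough illustration of that contract, here is a hypothetical, minimal stand-in for the SDK's config object, assuming a one-to-one mapping from environment variables to fields (the actual hfendpoints implementation may differ):

import os
from dataclasses import dataclass


@dataclass
class EndpointConfigSketch:
    """Hypothetical stand-in for hfendpoints.EndpointConfig, for illustration only."""
    interface: str
    port: int

    @classmethod
    def from_env(cls) -> "EndpointConfigSketch":
        # Defaults mirror the ENV lines in the Dockerfile above.
        return cls(
            interface=os.environ.get("INTERFACE", "0.0.0.0"),
            port=int(os.environ.get("PORT", "80")),
        )


config = EndpointConfigSketch.from_env()
print(f"Would bind on {config.interface}:{config.port}")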
handler.py
ADDED
@@ -0,0 +1,104 @@
+import asyncio
+import zlib
+from functools import partial
+from io import BytesIO
+
+from hfendpoints.openai import Context, run
+from hfendpoints.openai.audio import AutomaticSpeechRecognitionEndpoint, SegmentBuilder, Segment, \
+    TranscriptionRequest, TranscriptionResponse, TranscriptionResponseKind, VerboseTranscription
+from librosa import load as load_audio, get_duration
+from loguru import logger
+from nemo.collections.asr.models import ASRModel
+
+from hfendpoints import EndpointConfig, Handler, __version__
+
+
+def compression_ratio(text: str) -> float:
+    """
+    :param text: Transcribed text to score.
+    :return: Ratio of the UTF-8 byte length to the zlib-compressed length; higher means more repetitive text.
+    """
+    text_bytes = text.encode("utf-8")
+    return len(text_bytes) / len(zlib.compress(text_bytes))
+
+
+def get_segment(idx: int, segment, tokenizer, request: TranscriptionRequest) -> Segment:
+    return SegmentBuilder() \
+        .id(idx) \
+        .start(segment['start']) \
+        .end(segment['end']) \
+        .text(segment['segment']) \
+        .tokens(tokenizer.text_to_ids(segment['segment'])) \
+        .temperature(request.temperature) \
+        .compression_ratio(compression_ratio(segment['segment'])) \
+        .build()
+
+
+class NemoAsrHandler(Handler):
+    __slots__ = ("_model",)
+
+    def __init__(self, config: EndpointConfig):
+        logger.info(f"Loading model from {config.repository}")
+        self._model = ASRModel.from_pretrained(model_name=str(config.repository)).eval()
+
+    async def __call__(self, request: TranscriptionRequest, ctx: Context) -> TranscriptionResponse:
+        with logger.contextualize(request_id=ctx.request_id):
+            with memoryview(request) as audio:
+                (waveform, sampling) = load_audio(BytesIO(audio), sr=16000, mono=True)
+                logger.debug(
+                    f"Successfully decoded {len(waveform)} samples of PCM audio"
+                )
+
+            # Do we need to compute the timestamps?
+            needs_timestamps = request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
+            transcribe_f = partial(self._model.transcribe, timestamps=needs_timestamps, verbose=False)
+
+            outputs = await asyncio.get_running_loop().run_in_executor(
+                None,
+                transcribe_f,
+                (waveform,)
+            )
+
+            output = outputs[0]
+            text = output.text
+
+            match request.response_kind:
+                case TranscriptionResponseKind.VERBOSE_JSON:
+                    segment_timestamps = output.timestamp['segment']
+                    segments = [
+                        get_segment(idx, stamp, self._model.tokenizer, request)
+                        for (idx, stamp) in enumerate(segment_timestamps)
+                    ]
+
+                    logger.info(f"Segment: {segment_timestamps[0] if segment_timestamps else 'n/a'}")
+
+                    return TranscriptionResponse.verbose(
+                        VerboseTranscription(
+                            text=text,
+                            duration=get_duration(y=waveform, sr=sampling),
+                            language=request.language,
+                            segments=segments,
+                            # word=None
+                        )
+                    )
+                case TranscriptionResponseKind.JSON:
+                    return TranscriptionResponse.json(text)
+
+                case TranscriptionResponseKind.TEXT:
+                    return TranscriptionResponse.text(text)
+
+            # Theoretically, we can't end up here, as Rust validates the enum value beforehand
+            raise RuntimeError(f"unknown response_kind: {request.response_kind}")
+
+
+def entrypoint():
+    config = EndpointConfig.from_env()
+    handler = NemoAsrHandler(config)
+    endpoint = AutomaticSpeechRecognitionEndpoint(handler)
+
+    logger.info(f"[Hugging Face Endpoint v{__version__}] Serving: {config.model_id}")
+    run(endpoint, config.interface, config.port)
+
+
+if __name__ == '__main__':
+    entrypoint()
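A note on `compression_ratio`: this is the same zlib-based repetition heuristic popularized by Whisper's decoder, where an abnormally high ratio flags looping, degenerate output. A quick standalone check of how the metric behaves (plain Python, no endpoint needed):

import zlib


def compression_ratio(text: str) -> float:
    # Identical to the helper in handler.py above.
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))


# Natural speech barely compresses; short inputs can even expand (ratio near or below 1):
print(compression_ratio("The quick brown fox jumps over the lazy dog."))
# A degenerate decoding loop compresses extremely well, so the ratio spikes:
print(compression_ratio("thank you thank you " * 25))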
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+huggingface_hub[hf_xet]
+librosa >= 0.11.0
+nemo_toolkit[asr] >= 2.3.0
+numpy
+tqdm
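Once the container is up, the endpoint should be reachable with any OpenAI-compatible client. A sketch of a smoke test, with the caveat that the route and model identifier below are assumptions (the hfendpoints.openai naming suggests the standard /v1/audio/transcriptions route, but this commit does not show it), and sample.wav is a placeholder file:

from openai import OpenAI

# base_url, api_key, model, and file are all placeholders / assumptions.
client = OpenAI(base_url="http://localhost:80/v1", api_key="unused")

with open("sample.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="nvidia/parakeet-tdt-0.6b-v2",  # hypothetical NeMo ASR repository
        file=audio_file,
        response_format="verbose_json",  # exercises the VERBOSE_JSON branch in handler.py
    )

print(transcription.text)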