File size: 1,816 Bytes
10a86b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

import whisper
from transformers import pipeline
import torch
import torchaudio

# 🔹 Whisper transcription
def transcribe_text(audio_path):
    """Transcribe an audio file with Whisper, decoding it as Lithuanian.

    The Whisper "base" checkpoint is loaded anew on every call.

    Args:
        audio_path: Audio input handed unchanged to ``model.transcribe``
            (presumably a file path - confirm against callers).

    Returns:
        The ``"text"`` entry of Whisper's result with surrounding whitespace
        removed, or an empty string when the result has no ``"text"`` entry.
    """
    whisper_model = whisper.load_model("base")
    # language='lt' pins decoding to Lithuanian instead of auto-detecting.
    transcript = whisper_model.transcribe(audio_path, language='lt').get("text", "")
    return transcript.strip()

# 🔹 Whisper language recognition (with an additional keyword check)
def recognize_language(audio_path):
    """Detect the spoken language of an audio file.

    The Whisper "base" checkpoint (loaded anew on every call) transcribes the
    input without a language hint and reports the language it detected. That
    result is overridden when the transcript contains a keyword from one of
    the hard-coded lists below. The lists are checked in the order
    Lithuanian, English, German, and the first list with a hit wins.

    Keywords are matched as whole words (or whole phrases), not as raw
    substrings. Substring matching made short keywords fire inside unrelated
    words: "ich" matched English "which" and "rich", so such a transcript was
    reported as German.

    Args:
        audio_path: Audio input handed unchanged to ``model.transcribe``
            (presumably a file path - confirm against callers).

    Returns:
        ``"lt"``, ``"en"`` or ``"de"`` when a keyword matched; otherwise the
        ``"language"`` entry of Whisper's result, or ``"unknown"`` when that
        entry is missing.
    """
    import re  # local import keeps this block self-contained

    model = whisper.load_model("base")
    result = model.transcribe(audio_path)
    text = result.get("text", "")
    lang_code = result.get("language", "unknown")

    # Order matters: the first language whose list produces a hit is returned.
    keyword_table = (
        ("lt", ("labas", "ačiū", "draugas", "vardas", "sekasi", "prašau")),
        ("en", ("hello", "name", "how are you", "friend", "please")),
        ("de", ("hallo", "danke", "freund", "ich", "bitte")),
    )

    # Reduce the transcript to lowercase word tokens joined by single spaces
    # and pad both ends, so " keyword " containment is a whole-word test that
    # also works for the multi-word phrase "how are you". ``\w`` is
    # Unicode-aware for str patterns, so "ačiū" and "prašau" stay intact.
    padded = " " + " ".join(re.findall(r"\w+", text.lower())) + " "

    for code, keywords in keyword_table:
        if any(f" {keyword} " in padded for keyword in keywords):
            return code

    # No keyword hit: trust Whisper's own detection.
    return lang_code

# 🔸 Wav2Vec2 transcription (with language selection)
def transcribe_text_wav2vec(audio_path, kalba):
    """Transcribe an audio file with a language-specific Wav2Vec2 checkpoint.

    A fresh ``automatic-speech-recognition`` pipeline is built on every call
    for the checkpoint mapped to ``kalba``. The audio is loaded with
    torchaudio, resampled to 16 kHz when its rate differs, and only the first
    channel is passed to the pipeline.

    Args:
        audio_path: Audio input handed unchanged to ``torchaudio.load``.
        kalba: Language code selecting the checkpoint; one of ``"lt"``,
            ``"en"`` or ``"de"``.

    Returns:
        The ``"text"`` entry of the pipeline output.

    Raises:
        ValueError: If ``kalba`` is not one of the supported codes. This is
            checked before any model or audio is loaded.
    """
    target_rate = 16000
    checkpoints = {
        "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian",
        "en": "facebook/wav2vec2-base-960h",
        "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
    }

    checkpoint = checkpoints.get(kalba)
    if checkpoint is None:
        raise ValueError(f"Nepalaikoma kalba: {kalba}")

    asr = pipeline("automatic-speech-recognition", model=checkpoint)

    waveform, source_rate = torchaudio.load(audio_path)
    if source_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=source_rate, new_freq=target_rate
        )
        waveform = to_target_rate(waveform)

    # Index 0 selects the first channel; any further channels are dropped.
    first_channel = waveform[0].numpy()
    return asr(first_channel)["text"]