Spaces:
Sleeping
Sleeping
File size: 3,608 Bytes
a36df50 da304d0 44f51cc 1762ffe 44f51cc a36df50 44f51cc 1232794 44f51cc a36df50 44f51cc 1232794 44f51cc bfab15d 1232794 44f51cc 23cc1b1 44f51cc 1232794 44f51cc bfab15d 1232794 44f51cc 1232794 44f51cc 4ac2c78 44f51cc a36df50 44f51cc a36df50 44f51cc a36df50 44f51cc a36df50 44f51cc a36df50 44f51cc 1232794 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import re

import gradio as gr
import pandas as pd
import torch
from pydub import AudioSegment
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
# Load the uploaded fine-tuned Whisper model
repo_name = "ireneminhee/speech-to-depression"
model = WhisperForConditionalGeneration.from_pretrained(repo_name)
processor = WhisperProcessor.from_pretrained(repo_name)
# Function converting speech to text
def transcribe(audio):
inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
generated_ids = model.generate(inputs.input_features)
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
return transcription
# Load the depression-prediction model
def load_model_from_safetensors(model_name, safetensors_path):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_name)
state_dict = torch.load(safetensors_path) # safetensors๋ฅผ ๋ชจ๋ธ๋ก ๋ก๋
model.load_state_dict(state_dict)
model.eval()
return model, tokenizer
# Prediction function
def predict_depression(sentences, model, tokenizer):
results = []
for sentence in sentences:
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
prediction = torch.argmax(logits, dim=-1).item()
results.append((sentence, prediction))
return results
# Function running the whole pipeline
def process_audio_and_predict(audio):
# 1. Whisper ๋ชจ๋ธ๋ก ์์ฑ์ ํ
์คํธ๋ก ๋ณํ
text = transcribe_audio(audio)
# 2. ํ
์คํธ๋ฅผ ๋ฌธ์ฅ ๋จ์๋ก ๋๋๊ธฐ
#sentences = split_sentences_using_gpt(text)
# 3. ๋ชจ๋ธ ๋ก๋ (๋ชจ๋ธ ๊ฒฝ๋ก์ ๋ง๊ฒ ์์ )
# ๋ชจ๋ธ๊ณผ tokenizer ๊ฒฝ๋ก (์ฌ์ฉ์ ํ๊ฒฝ์ ๋ง๊ฒ ์ค์ )
safetensors_path = "./model/model.safetensors" # SafeTensors ๋ชจ๋ธ ํ์ผ ๊ฒฝ๋ก
model_name = "klue/bert-base" # ๋ชจ๋ธ ์ด๋ฆ ๋๋ Hugging Face ๊ฒฝ๋ก
model, tokenizer = load_model_from_safetensors(model_name, safetensors_path)
# 4. ๋ฌธ์ฅ๋ณ๋ก ์ฐ์ธ ์ฆ์ ์์ธก
results = predict_depression(text, model, tokenizer)
# 5. ๊ฒฐ๊ณผ๋ฅผ ๋ฐํ
df_result = pd.DataFrame(results, columns=["Sentence", "Depression_Prediction"])
average_probability = df_result["Depression_Prediction"].mean()
return f"Average Depression Probability: {average_probability:.2f}"
# Function wired to the Gradio interface
def gradio_process_audio(audio_data):
# ์ฌ์ฉ์๊ฐ ๋ง์ดํฌ๋ก ์
๋ ฅํ ์์ฑ์ ์์ ํ์ผ๋ก ์ ์ฅ
temp_audio_path = "temp_audio.wav"
with open(temp_audio_path, "wb") as f:
f.write(audio_data)
# ์ค๋์ค ์ฒ๋ฆฌ ๋ฐ ์์ธก
average_probability, df_result = process_audio_and_detect_depression(temp_audio_path, safetensors_path, model_name)
# ๊ฒฐ๊ณผ ์ถ๋ ฅ
return f"Average Depression Probability: {average_probability:.2f}", df_result
# Define the Gradio interface
interface = gr.Interface(
fn=gradio_process_audio, # Gradio์์ ํธ์ถํ ํจ์
inputs=gr.Audio(type="numpy"), # ์ฌ์ฉ์ ์์ฑ ์
๋ ฅ (๋ง์ดํฌ)
outputs=[
gr.Textbox(label="Depression Probability"), # ํ๊ท ํ๋ฅ
gr.Dataframe(label="Sentence-wise Analysis") # ์์ธ ๋ถ์ ๊ฒฐ๊ณผ
],
title="Depression Detection from Audio",
description="Record your voice, and the model will analyze the text for depression likelihood."
)
# Gradio ์คํ
interface.launch(share=True) |