# Hugging Face Spaces page header captured along with the source (commented
# out so the module parses):
# Spaces:
# Sleeping
# Sleeping
# import sounddevice as sd | |
# import soundfile as sf | |
# import speech_recognition as sr | |
# from gtts import gTTS | |
# import pygame | |
# import time | |
# import gradio as gr | |
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering | |
# model = AutoModelForQuestionAnswering.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi') | |
# tokenizer = AutoTokenizer.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi') | |
# class AvishkaaramEkta: | |
# def __init__(self, model): | |
# self.model = model | |
# self.tokenizer = tokenizer | |
# def text_to_speech(self, text, output_file): | |
# # Create a gTTS object with the text and desired language | |
# tts = gTTS(text=text, lang='en') | |
# # Save the audio to a file | |
# tts.save(output_file) | |
# def play_mp3(self, file_path): | |
# pygame.mixer.init() | |
# pygame.mixer.music.load(file_path) | |
# pygame.mixer.music.play() | |
# while pygame.mixer.music.get_busy(): | |
# continue | |
# def ask_question(self, audio_file): | |
# print("Recording audio...") | |
# audio = sd.rec(int(44100 * 6), samplerate=44100, channels=1) | |
# sd.wait() | |
# # Save the audio to a file | |
# sf.write(audio_file, audio, 44100) | |
# print(f"Audio saved to {audio_file}") | |
# r = sr.Recognizer() | |
# with sr.AudioFile(audio_file) as source: | |
# audio_data = r.record(source) | |
# text = "" | |
# try: | |
# text = r.recognize_google(audio_data) | |
# print("Transcription:", text) | |
# except sr.UnknownValueError: | |
# print("Speech recognition could not understand audio") | |
# except sr.RequestError as e: | |
# print("Could not request results from Google Speech Recognition service; {0}".format(e)) | |
# return text | |
# def answer_question(self, passage, question): | |
# inputs = self.tokenizer(passage, question, return_tensors="pt") | |
# outputs = self.model(**inputs) | |
# start_logits = outputs.start_logits | |
# end_logits = outputs.end_logits | |
# start_index = start_logits.argmax(dim=1).item() | |
# end_index = end_logits.argmax(dim=1).item() | |
# tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) | |
# answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1]) | |
# return answer | |
# def question_answer(self, passage, question): | |
# passage_audio_file = "passage.mp3" | |
# question_audio_file = "question.wav" | |
# answer_audio_file = "answer.mp3" | |
# self.text_to_speech(passage, passage_audio_file) | |
# self.play_mp3(passage_audio_file) | |
# question_text = self.ask_question(question_audio_file) | |
# answer = self.answer_question(passage, question_text) | |
# self.text_to_speech("The answer to the question is: " + answer, answer_audio_file) | |
# self.play_mp3(answer_audio_file) | |
# time.sleep(5) # Wait for 5 seconds before ending | |
# return answer | |
# # Create an instance of the AvishkaaramEkta class | |
# avishkaaram_ekta = AvishkaaramEkta(model) | |
# # Define the Gradio interface | |
# iface = gr.Interface( | |
# fn=avishkaaram_ekta.question_answer, | |
# inputs=["text", "text"], | |
# outputs="text", | |
# title="Audio Question Answering", | |
# description="Ask a question about a given passage using audio input", | |
# examples=[ | |
# ["In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania to study chimpanzees.", "What did Dr. Jane Goodall study?"], | |
# ["The Taj Mahal is located in Agra, India.", "Where is the Taj Mahal situated?"], | |
# ], | |
# interpretation="default", | |
# ) | |
# # Launch the Gradio interface | |
# iface.launch() | |
# import torch | |
# import torchaudio | |
# import soundfile as sf | |
# import speech_recognition as sr | |
# from gtts import gTTS | |
# import pygame | |
# import time | |
# import gradio as gr | |
# import os | |
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering | |
# model = AutoModelForQuestionAnswering.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi') | |
# tokenizer = AutoTokenizer.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi') | |
# os.environ['SDL_AUDIODRIVER'] = 'dsp' | |
# class AvishkaaramEkta: | |
# def __init__(self, model): | |
# self.model = model | |
# self.tokenizer = tokenizer | |
# def text_to_speech(self, text, output_file): | |
# # Create a gTTS object with the text and desired language | |
# tts = gTTS(text=text, lang='en') | |
# # Save the audio to a file | |
# tts.save(output_file) | |
# def play_mp3(self, file_path): | |
# pygame.mixer.init() | |
# pygame.mixer.music.load(file_path) | |
# pygame.mixer.music.play() | |
# while pygame.mixer.music.get_busy(): | |
# continue | |
# def ask_question(self, audio_file): | |
# print("Recording audio...") | |
# waveform, sample_rate = torchaudio.rec(6, sr=44100, channels=1) | |
# # Save the audio to a file | |
# sf.write(audio_file, waveform.squeeze().numpy(), sample_rate) | |
# print(f"Audio saved to {audio_file}") | |
# r = sr.Recognizer() | |
# with sr.AudioFile(audio_file) as source: | |
# audio_data = r.record(source) | |
# text = "" | |
# try: | |
# text = r.recognize_google(audio_data) | |
# print("Transcription:", text) | |
# except sr.UnknownValueError: | |
# print("Speech recognition could not understand audio") | |
# except sr.RequestError as e: | |
# print("Could not request results from Google Speech Recognition service; {0}".format(e)) | |
# return text | |
# def answer_question(self, passage, question): | |
# inputs = self.tokenizer(passage, question, return_tensors="pt") | |
# outputs = self.model(**inputs) | |
# start_logits = outputs.start_logits | |
# end_logits = outputs.end_logits | |
# start_index = start_logits.argmax(dim=1).item() | |
# end_index = end_logits.argmax(dim=1).item() | |
# tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) | |
# answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1]) | |
# return answer | |
# def question_answer(self, passage, question): | |
# passage_audio_file = "passage.mp3" | |
# question_audio_file = "question.wav" | |
# answer_audio_file = "answer.mp3" | |
# self.text_to_speech(passage, passage_audio_file) | |
# self.play_mp3(passage_audio_file) | |
# question_text = self.ask_question(question_audio_file) | |
# answer = self.answer_question(passage, question_text) | |
# self.text_to_speech("The answer to the question is: " + answer, answer_audio_file) | |
# self.play_mp3(answer_audio_file) | |
# time.sleep(5) # Wait for 5 seconds before ending | |
# return answer | |
# # Create an instance of the AvishkaaramEkta class | |
# avishkaaram_ekta = AvishkaaramEkta(model) | |
# # Define the Gradio interface | |
# iface = gr.Interface( | |
# fn=avishkaaram_ekta.question_answer, | |
# inputs=["text", "text"], | |
# outputs="text", | |
# title="Audio Question Answering", | |
# description="Ask a question about a given passage using audio input", | |
# examples=[ | |
# ["In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania to study chimpanzees.", "What did Dr. Jane Goodall study?"], | |
# ["The Taj Mahal is located in Agra, India.", "Where is the Taj Mahal situated?"], | |
# ], | |
# interpretation="default", | |
# ) | |
# # Launch the Gradio interface | |
# iface.launch() | |
import torch
import torchaudio
import soundfile as sf
import speech_recognition as sr
from gtts import gTTS
import pygame
import time
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Route pygame's SDL audio through PulseAudio; headless hosts frequently have
# no default ALSA device, which makes pygame.mixer.init() fail otherwise.
os.environ["SDL_AUDIODRIVER"] = "pulseaudio"

# Extractive question-answering checkpoint used by the whole app; load the
# model and its matching tokenizer once at import time.
MODEL_NAME = "AVISHKAARAM/avishkaarak-ekta-hindi"
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
class AvishkaaramEkta:
    """Voice question-answering pipeline.

    Speaks a passage aloud, records a spoken question from the microphone,
    transcribes it with Google Speech Recognition, and answers it with an
    extractive QA transformer model.
    """

    def __init__(self, model, qa_tokenizer=None):
        """
        Args:
            model: a HuggingFace question-answering model
                (``AutoModelForQuestionAnswering`` instance).
            qa_tokenizer: the tokenizer matching ``model``.  When omitted the
                module-level ``tokenizer`` global is used, preserving the
                behavior of the original one-argument constructor.
        """
        self.model = model
        # The original code bound the module-global silently; keep that as the
        # default but allow explicit injection for testability.
        self.tokenizer = qa_tokenizer if qa_tokenizer is not None else tokenizer

    def text_to_speech(self, text, output_file):
        """Synthesize *text* (English) to an MP3 file at *output_file* via gTTS."""
        tts = gTTS(text=text, lang='en')
        tts.save(output_file)

    def play_mp3(self, file_path):
        """Play an audio file through pygame and block until playback ends."""
        pygame.mixer.init()
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()
        # Sleep between polls instead of spinning (`continue` in a tight loop
        # pegs a CPU core for the entire duration of playback).
        while pygame.mixer.music.get_busy():
            time.sleep(0.1)

    def ask_question(self, audio_file):
        """Record ~6 s of microphone audio to *audio_file* (WAV) and return
        its Google Speech Recognition transcription ("" when recognition fails).
        """
        print("Recording audio...")
        # NOTE(review): torchaudio exposes no ``rec`` function in any released
        # version, so this line raises AttributeError at runtime.  The earlier
        # (commented-out) revision of this file used ``sounddevice.rec`` for
        # the same purpose; restoring that is the likely intended fix, but
        # sounddevice is not imported by the live code -- TODO confirm the
        # recording backend before changing it.
        waveform, sample_rate = torchaudio.rec(6, sr=44100, channels=1)
        sf.write(audio_file, waveform.squeeze().numpy(), sample_rate)
        print(f"Audio saved to {audio_file}")
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        text = ""
        try:
            text = recognizer.recognize_google(audio_data)
            print("Transcription:", text)
        except sr.UnknownValueError:
            # Audio was captured but contained no recognizable speech.
            print("Speech recognition could not understand audio")
        except sr.RequestError as e:
            # Network/API failure talking to the recognition service.
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return text

    def answer_question(self, passage, question):
        """Extract the answer span for *question* from *passage*.

        Returns the decoded answer string predicted by the QA model.
        """
        inputs = self.tokenizer(passage, question, return_tensors="pt")
        # Inference only -- disable autograd to save memory and time.
        with torch.no_grad():
            outputs = self.model(**inputs)
        start_index = outputs.start_logits.argmax(dim=1).item()
        end_index = outputs.end_logits.argmax(dim=1).item()
        # Guard against a degenerate prediction (end before start), which
        # would otherwise produce an empty token slice.
        if end_index < start_index:
            end_index = start_index
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        return self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

    def question_answer(self, passage, question):
        """Full voice interaction: speak the passage, record and transcribe a
        spoken question, answer it, and speak the answer.  Returns the answer.

        Note: the *question* argument is kept for interface compatibility with
        the Gradio form but is ignored -- the actual question is taken from
        the microphone recording.
        """
        passage_audio_file = "passage.mp3"
        question_audio_file = "question.wav"
        answer_audio_file = "answer.mp3"
        self.text_to_speech(passage, passage_audio_file)
        self.play_mp3(passage_audio_file)
        question_text = self.ask_question(question_audio_file)
        answer = self.answer_question(passage, question_text)
        self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
        self.play_mp3(answer_audio_file)
        time.sleep(5)  # Wait for 5 seconds before ending
        return answer
# Create the application instance backed by the globally loaded model.
avishkaaram_ekta = AvishkaaramEkta(model)

# Gradio front-end: two text inputs (passage, question) mapped to one text
# output.  Each entry in `examples` must supply one value per input component,
# i.e. a [passage, question] pair -- the original listed the passage and the
# question as two separate one-element examples, which Gradio rejects for a
# two-input interface.
iface = gr.Interface(
    fn=avishkaaram_ekta.question_answer,
    inputs=["text", "text"],
    outputs="text",
    title="Audio Question Answering",
    description="Ask a question about a given passage using audio input",
    examples=[
        [
            "In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania, to study the behavior of chimpanzees in the wild.",
            "What was the purpose of Dr. Jane Goodall's visit to Gombe?",
        ],
    ],
)

# Launch the web interface (blocks until the server is stopped).
iface.launch()