"""Audio question answering demo.

Speaks a passage aloud, records a spoken question from the microphone,
answers it with an extractive QA model, and speaks the answer back
through a Gradio interface.
"""

import os
import time

import gradio as gr
import pygame
import speech_recognition as sr
import torch
from gtts import gTTS
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# pygame's SDL mixer needs an audio driver selected before mixer.init().
# NOTE(review): "pulseaudio" assumes a PulseAudio server is running on the
# deployment host — confirm (the older revisions used "dsp").
os.environ["SDL_AUDIODRIVER"] = "pulseaudio"

MODEL_NAME = "AVISHKAARAM/avishkaarak-ekta-hindi"
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class AvishkaaramEkta:
    """Pipeline tying together TTS, microphone capture, ASR, and extractive QA."""

    def __init__(self, model, tokenizer=tokenizer):
        """Store the QA model and tokenizer.

        ``tokenizer`` defaults to the module-level one so existing
        ``AvishkaaramEkta(model)`` callers keep working; previously the
        global was read implicitly.
        """
        self.model = model
        self.tokenizer = tokenizer

    def text_to_speech(self, text, output_file):
        """Synthesize *text* as English speech and save it to *output_file*."""
        # Create a gTTS object with the text and desired language
        tts = gTTS(text=text, lang="en")
        # Save the audio to a file
        tts.save(output_file)

    def play_mp3(self, file_path):
        """Play an audio file and block until playback finishes."""
        pygame.mixer.init()
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()
        # Sleep while polling: the original `continue` loop busy-waited and
        # pinned a CPU core for the whole duration of playback.
        while pygame.mixer.music.get_busy():
            time.sleep(0.1)

    def ask_question(self, audio_file):
        """Record ~6 seconds from the microphone and return the transcript.

        The raw capture is saved to *audio_file* (WAV) for debugging/reuse.
        Returns "" when speech recognition fails.
        """
        print("Recording audio...")
        recognizer = sr.Recognizer()
        # Bug fix: the previous code called torchaudio.rec(), which does not
        # exist in torchaudio. Record via speech_recognition's microphone
        # support instead (PyAudio-backed, already a project dependency).
        with sr.Microphone(sample_rate=44100) as source:
            audio_data = recognizer.record(source, duration=6)
        # Save the audio to a file
        with open(audio_file, "wb") as f:
            f.write(audio_data.get_wav_data())
        print(f"Audio saved to {audio_file}")
        text = ""
        try:
            text = recognizer.recognize_google(audio_data)
            print("Transcription:", text)
        except sr.UnknownValueError:
            print("Speech recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return text

    def answer_question(self, passage, question):
        """Extract the answer span for *question* from *passage*.

        Returns the decoded answer string (may be empty or garbled when the
        predicted end index precedes the start index).
        """
        # NOTE(review): HF extractive-QA models are usually fed
        # (question, context); this model is called with (passage, question) —
        # confirm against how avishkaarak-ekta-hindi was fine-tuned.
        inputs = self.tokenizer(passage, question, return_tensors="pt")
        with torch.no_grad():  # inference only — avoid building autograd graphs
            outputs = self.model(**inputs)
        start_index = outputs.start_logits.argmax(dim=1).item()
        end_index = outputs.end_logits.argmax(dim=1).item()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])
        return answer

    def question_answer(self, passage, question):
        """Full voice round-trip: speak passage, record question, speak answer.

        NOTE(review): the *question* text from the UI is ignored — the
        question is taken from the microphone recording. Confirm that this
        is intentional.
        """
        passage_audio_file = "passage.mp3"
        question_audio_file = "question.wav"
        answer_audio_file = "answer.mp3"
        self.text_to_speech(passage, passage_audio_file)
        self.play_mp3(passage_audio_file)
        question_text = self.ask_question(question_audio_file)
        answer = self.answer_question(passage, question_text)
        self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
        self.play_mp3(answer_audio_file)
        time.sleep(5)  # Wait for 5 seconds before ending
        return answer


# Create an instance of the AvishkaaramEkta class
avishkaaram_ekta = AvishkaaramEkta(model, tokenizer)

# Define the Gradio interface. The interface declares two text inputs, so
# each example must supply BOTH values; the previous code listed the passage
# and the question as two separate one-element examples, which breaks the
# examples panel.
iface = gr.Interface(
    fn=avishkaaram_ekta.question_answer,
    inputs=["text", "text"],
    outputs="text",
    title="Audio Question Answering",
    description="Ask a question about a given passage using audio input",
    examples=[
        [
            "In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania, to study the behavior of chimpanzees in the wild.",
            "What was the purpose of Dr. Jane Goodall's visit to Gombe?",
        ],
    ],
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()