import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
from gtts import gTTS
import pygame
import time
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# Point SDL/pygame at PulseAudio so audio playback works in the hosted environment
os.environ["SDL_AUDIODRIVER"] = "pulseaudio"
model = AutoModelForQuestionAnswering.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi')
tokenizer = AutoTokenizer.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi')
class AvishkaaramEkta:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
def text_to_speech(self, text, output_file):
# Create a gTTS object with the text and desired language
tts = gTTS(text=text, lang='en')
# Save the audio to a file
tts.save(output_file)
    def play_mp3(self, file_path):
        pygame.mixer.init()
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()
        # Block until playback finishes, polling at a low rate instead of spinning
        while pygame.mixer.music.get_busy():
            time.sleep(0.1)
    def ask_question(self, audio_file):
        print("Recording audio...")
        # Record 6 seconds of mono audio at 44.1 kHz from the default microphone
        audio = sd.rec(int(44100 * 6), samplerate=44100, channels=1)
        sd.wait()
        # Save the recording so speech_recognition can read it back from disk
        sf.write(audio_file, audio, 44100)
print(f"Audio saved to {audio_file}")
r = sr.Recognizer()
with sr.AudioFile(audio_file) as source:
audio_data = r.record(source)
text = ""
try:
text = r.recognize_google(audio_data)
print("Transcription:", text)
except sr.UnknownValueError:
print("Speech recognition could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Speech Recognition service; {0}".format(e))
return text
    def answer_question(self, passage, question):
        # Extractive QA: the model scores every input token as a possible start/end of the answer span
        inputs = self.tokenizer(passage, question, return_tensors="pt")
        outputs = self.model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        # Take the highest-scoring start and end positions and decode the tokens between them
        start_index = start_logits.argmax(dim=1).item()
        end_index = end_logits.argmax(dim=1).item()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])
        return answer
    def question_answer(self, passage, question):
        passage_audio_file = "passage.mp3"
        question_audio_file = "question.wav"
        answer_audio_file = "answer.mp3"
        # Read the passage aloud before answering
        self.text_to_speech(passage, passage_audio_file)
        self.play_mp3(passage_audio_file)
        # Prefer the typed question; fall back to recording one if the field was left empty
        if question and question.strip():
            question_text = question.strip()
        else:
            question_text = self.ask_question(question_audio_file)
        answer = self.answer_question(passage, question_text)
        self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
        self.play_mp3(answer_audio_file)
        time.sleep(5)  # Wait for 5 seconds before ending
        return answer
# Create an instance of the AvishkaaramEkta class
avishkaaram_ekta = AvishkaaramEkta(model, tokenizer)
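# A minimal usage sketch (not part of the Space's UI flow): calling answer_question
# directly with a typed passage and question, skipping the text-to-speech round trip.
# The sample strings below are illustrative only.
if __name__ == "__main__":
    _sample_passage = "The Taj Mahal is located in Agra, India."
    _sample_question = "Where is the Taj Mahal situated?"
    print("Direct QA check:", avishkaaram_ekta.answer_question(_sample_passage, _sample_question))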
# Define the Gradio interface
iface = gr.Interface(
fn=avishkaaram_ekta.question_answer,
inputs=["text", "text"],
outputs="text",
title="Audio Question Answering",
    description="Ask a question about a given passage (type it, or leave it blank to record it aloud)",
    examples=[
        ["In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania, to study the behavior of chimpanzees in the wild.", "What was the purpose of Dr. Jane Goodall's visit to Gombe?"],
        ["The Taj Mahal is located in Agra, India.", "Where is the Taj Mahal situated?"],
    ],
)
# Launch the interface
iface.launch()
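# Note: when running outside Hugging Face Spaces (e.g. locally or in a notebook),
# Gradio's standard share=True option can expose a temporary public link:
# iface.launch(share=True)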