"""Audio question answering demo.

Speaks a passage aloud, records a spoken question from the microphone,
answers it with an extractive QA model, and speaks the answer back
through a Gradio interface.
"""

import os
import time

import gradio as gr
import pygame
import speech_recognition as sr
import torch
from gtts import gTTS
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# pygame's SDL mixer needs an audio driver selected before mixer.init().
# NOTE(review): "pulseaudio" assumes a PulseAudio server is running on the
# deployment host — confirm (the older revisions used "dsp").
os.environ["SDL_AUDIODRIVER"] = "pulseaudio"

MODEL_NAME = "AVISHKAARAM/avishkaarak-ekta-hindi"
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class AvishkaaramEkta:
    """Pipeline tying together TTS, microphone capture, ASR, and extractive QA."""

    def __init__(self, model, tokenizer=tokenizer):
        """Store the QA model and tokenizer.

        ``tokenizer`` defaults to the module-level one so existing
        ``AvishkaaramEkta(model)`` callers keep working; previously the
        global was read implicitly.
        """
        self.model = model
        self.tokenizer = tokenizer

    def text_to_speech(self, text, output_file):
        """Synthesize *text* as English speech and save it to *output_file*."""
        # Create a gTTS object with the text and desired language
        tts = gTTS(text=text, lang="en")
        # Save the audio to a file
        tts.save(output_file)

    def play_mp3(self, file_path):
        """Play an audio file and block until playback finishes."""
        pygame.mixer.init()
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()
        # Sleep while polling: the original `continue` loop busy-waited and
        # pinned a CPU core for the whole duration of playback.
        while pygame.mixer.music.get_busy():
            time.sleep(0.1)

    def ask_question(self, audio_file):
        """Record ~6 seconds from the microphone and return the transcript.

        The raw capture is saved to *audio_file* (WAV) for debugging/reuse.
        Returns "" when speech recognition fails.
        """
        print("Recording audio...")
        recognizer = sr.Recognizer()
        # Bug fix: the previous code called torchaudio.rec(), which does not
        # exist in torchaudio. Record via speech_recognition's microphone
        # support instead (PyAudio-backed, already a project dependency).
        with sr.Microphone(sample_rate=44100) as source:
            audio_data = recognizer.record(source, duration=6)
        # Save the audio to a file
        with open(audio_file, "wb") as f:
            f.write(audio_data.get_wav_data())
        print(f"Audio saved to {audio_file}")
        text = ""
        try:
            text = recognizer.recognize_google(audio_data)
            print("Transcription:", text)
        except sr.UnknownValueError:
            print("Speech recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return text

    def answer_question(self, passage, question):
        """Extract the answer span for *question* from *passage*.

        Returns the decoded answer string (may be empty or garbled when the
        predicted end index precedes the start index).
        """
        # NOTE(review): HF extractive-QA models are usually fed
        # (question, context); this model is called with (passage, question) —
        # confirm against how avishkaarak-ekta-hindi was fine-tuned.
        inputs = self.tokenizer(passage, question, return_tensors="pt")
        with torch.no_grad():  # inference only — avoid building autograd graphs
            outputs = self.model(**inputs)
        start_index = outputs.start_logits.argmax(dim=1).item()
        end_index = outputs.end_logits.argmax(dim=1).item()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])
        return answer

    def question_answer(self, passage, question):
        """Full voice round-trip: speak passage, record question, speak answer.

        NOTE(review): the *question* text from the UI is ignored — the
        question is taken from the microphone recording. Confirm that this
        is intentional.
        """
        passage_audio_file = "passage.mp3"
        question_audio_file = "question.wav"
        answer_audio_file = "answer.mp3"
        self.text_to_speech(passage, passage_audio_file)
        self.play_mp3(passage_audio_file)
        question_text = self.ask_question(question_audio_file)
        answer = self.answer_question(passage, question_text)
        self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
        self.play_mp3(answer_audio_file)
        time.sleep(5)  # Wait for 5 seconds before ending
        return answer


# Create an instance of the AvishkaaramEkta class
avishkaaram_ekta = AvishkaaramEkta(model, tokenizer)

# Define the Gradio interface. The interface declares two text inputs, so
# each example must supply BOTH values; the previous code listed the passage
# and the question as two separate one-element examples, which breaks the
# examples panel.
iface = gr.Interface(
    fn=avishkaaram_ekta.question_answer,
    inputs=["text", "text"],
    outputs="text",
    title="Audio Question Answering",
    description="Ask a question about a given passage using audio input",
    examples=[
        [
            "In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania, to study the behavior of chimpanzees in the wild.",
            "What was the purpose of Dr. Jane Goodall's visit to Gombe?",
        ],
    ],
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()