import argparse
import json

import torch
from scipy.io.wavfile import write
from transformers import pipeline
from ultralytics import YOLO

from utils.character_to_character import count_characters_single_page
from utils.create_audiobook import create_audiobook
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.transcript_page import transcript_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from utils.visual_llm_gender import visual_llm_gender

# Optional dependencies, only needed for local audio playback (see the end of main()).
# import sounddevice as sd
# import soundfile as sf


def main():
    parser = argparse.ArgumentParser(description="Transcribe a single manga page image")
    parser.add_argument(
        "-img",
        "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Test image on which transcription needs to be done",
    )
    parser.add_argument(
        "-vis",
        "--visualize",
        action="store_true",
        help="Pass the flag (without a value) to visualize predictions",
    )
    args = parser.parse_args()
    print(f"Test image at: {args.test_image}")
    print(f"Visualize?: {args.visualize}")
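    # config.json is read just below for the model checkpoint paths. Its actual
    # contents are not shown in this file; a minimal sketch, assuming only the
    # two keys this script reads (the paths themselves are placeholders):
    # {
    #     "yolo_path": "path/to/yolo_weights.pt",
    #     "speaker_association_path": "path/to/speaker_association.pt"
    # }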
    with open('config.json') as f:
        config = json.load(f)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Detect panels, speech bubbles, faces, and bodies on the page.
    yolo_model = YOLO(config["yolo_path"])
    test_image = args.test_image
    predictions = get_predictions_from_yolo(test_image, yolo_model)

    # Estimate the reading order of the detected panels.
    boxOrderEstimator = panel_ordering(test_image, predictions)

    # Cluster character detections so recurring characters share a label.
    unique_chars, labels, pairs = count_characters_single_page(
        test_image, predictions, eps=0.3, min_samples=1, device=device
    )

    # Associate each speech bubble with the character speaking it.
    assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)

    # Debug output: dump the raw detections per category.
    # print("Panels predicted, present in predictions.panels")
    # for panel in predictions.panels:
    #     print(panel)
    # print("Speech bubbles predicted, present in predictions.speech_bubbles")
    # for bubble in predictions.speech_bubbles:
    #     print(bubble)
    # print("Faces predicted, present in predictions.faces")
    # for face in predictions.faces:
    #     print(face)
    # print("Bodies predicted, present in predictions.bodies")
    # for body in predictions.bodies:
    #     print(body)

    # OCR each speech bubble and translate Japanese -> English.
    translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)

    # Voice selection: the visual-LLM gender assignment is disabled for now,
    # so every line is read with the same placeholder voice.
    # gender_list = visual_llm_gender(unique_chars, predictions, test_image)
    # speaker_list = get_speakers(gender_list)
    speaker_list = ['Eugenio_Mataracı'] * 29  # 29 identical entries, as in the original hard-coded list

    # Synthesize the audiobook and write it to disk.
    combined, sample_rate = create_audiobook(english_transcript, speaker_list)
    write("output.wav", sample_rate, combined)

    # Optional local playback (requires the sounddevice and soundfile imports above).
    # data, fs = sf.read('output.wav')
    # sd.play(data, fs)
    # sd.wait()

    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)


if __name__ == "__main__":
    main()
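# Example invocation (the script filename is an assumption; the image path is
# the argparse default above):
#   python transcribe_page.py -img ./dataset/images/test/AkkeraKanjinchou_083.jpg -vis
# Output: output.wav in the working directory, plus visualization windows when
# -vis is passed.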