import argparse
import json

import torch
from scipy.io.wavfile import write
from transformers import pipeline
from ultralytics import YOLO

from utils.character_to_character import count_characters_single_page
from utils.create_audiobook import create_audiobook
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.transcript_page import transcript_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from utils.visual_llm_gender import visual_llm_gender

# Optional dependencies, only needed for local audio playback (see the end of main()).
# import sounddevice as sd
# import soundfile as sf


def main():
    parser = argparse.ArgumentParser(description="Transcribe a single manga page image")
    parser.add_argument(
        "-img",
        "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Test image on which transcription needs to be done",
    )
    parser.add_argument(
        "-vis",
        "--visualize",
        action="store_true",
        help="Pass the flag (without a value) to visualize predictions",
    )
    args = parser.parse_args()
    print(f"Test image at: {args.test_image}")
    print(f"Visualize?: {args.visualize}")
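    # config.json is read just below for the model checkpoint paths. Its actual
    # contents are not shown in this file; a minimal sketch, assuming only the
    # two keys this script reads (the paths themselves are placeholders):
    # {
    #     "yolo_path": "path/to/yolo_weights.pt",
    #     "speaker_association_path": "path/to/speaker_association.pt"
    # }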
    with open('config.json') as f:
        config = json.load(f)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Detect panels, speech bubbles, faces, and bodies on the page.
    yolo_model = YOLO(config["yolo_path"])
    test_image = args.test_image
    predictions = get_predictions_from_yolo(test_image, yolo_model)

    # Estimate the reading order of the detected panels.
    boxOrderEstimator = panel_ordering(test_image, predictions)

    # Cluster character detections so recurring characters share a label.
    unique_chars, labels, pairs = count_characters_single_page(
        test_image, predictions, eps=0.3, min_samples=1, device=device
    )

    # Associate each speech bubble with the character speaking it.
    assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)

    # Debug output: dump the raw detections per category.
    # print("Panels predicted, present in predictions.panels")
    # for panel in predictions.panels:
    #     print(panel)
    # print("Speech bubbles predicted, present in predictions.speech_bubbles")
    # for bubble in predictions.speech_bubbles:
    #     print(bubble)
    # print("Faces predicted, present in predictions.faces")
    # for face in predictions.faces:
    #     print(face)
    # print("Bodies predicted, present in predictions.bodies")
    # for body in predictions.bodies:
    #     print(body)

    # OCR each speech bubble and translate Japanese -> English.
    translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)

    # Voice selection: the visual-LLM gender assignment is disabled for now,
    # so every line is read with the same placeholder voice.
    # gender_list = visual_llm_gender(unique_chars, predictions, test_image)
    # speaker_list = get_speakers(gender_list)
    speaker_list = ['Eugenio_Mataracı'] * 29  # 29 identical entries, as in the original hard-coded list

    # Synthesize the audiobook and write it to disk.
    combined, sample_rate = create_audiobook(english_transcript, speaker_list)
    write("output.wav", sample_rate, combined)

    # Optional local playback (requires the sounddevice and soundfile imports above).
    # data, fs = sf.read('output.wav')
    # sd.play(data, fs)
    # sd.wait()

    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)


if __name__ == "__main__":
    main()
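# Example invocation (the script filename is an assumption; the image path is
# the argparse default above):
#   python transcribe_page.py -img ./dataset/images/test/AkkeraKanjinchou_083.jpg -vis
# Output: output.wav in the working directory, plus visualization windows when
# -vis is passed.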