import argparse
import json

import sounddevice as sd
import soundfile as sf
import torch
from scipy.io.wavfile import write
from transformers import pipeline
from ultralytics import YOLO

from utils.character_to_character import count_characters_single_page
from utils.create_audiobook import create_audiobook
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.transcript_page import transcript_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from utils.visual_llm_gender import visual_llm_gender


def main():
    parser = argparse.ArgumentParser(description="Script to transcribe a single manga page image")
    parser.add_argument(
        "-img", "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Test image on which the transcription is run",
    )
    parser.add_argument(
        "-vis", "--visualize",
        action="store_true",
        help="Visualize the predictions (flag only, takes no value)",
    )

    args = parser.parse_args()
    print(f"Test image at : {args.test_image}")
    print(f"Visualize? : {args.visualize}")
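    # Load the model paths from the repository's config.json and pick the compute device.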
    with open("config.json") as f:
        config = json.load(f)

    device = "cuda" if torch.cuda.is_available() else "cpu"
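    # Run YOLO detection on the test page; these predictions feed every later stage.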
    yolo_model = YOLO(config["yolo_path"])
    test_image = args.test_image
    predictions = get_predictions_from_yolo(test_image, yolo_model)
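    # Estimate the reading order of the detected panels.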
    boxOrderEstimator = panel_ordering(test_image, predictions)
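    # Cluster the character detections to find the unique characters on the page.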
    unique_chars, labels, pairs = count_characters_single_page(
        test_image, predictions, eps=0.3, min_samples=1, device=device
    )
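    # Load the trained speaker-association model and identify the speaker of each piece of dialogue.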
    assoc_model = load_trained_speaker_model(config["speaker_association_path"], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
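    # Translate the extracted Japanese text and assemble the per-page transcripts.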
    translator = pipeline("translation", model="Mitsua/elan-mt-bt-ja-en")
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
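    # Infer each unique character's gender with a visual LLM and derive the speaker list from it.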
    gender_list = visual_llm_gender(unique_chars, predictions, test_image)
    speaker_list = get_speakers(gender_list)
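    # Synthesize the audiobook audio and save it as a WAV file.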
    combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
    write("output.wav", sample_rate, combined)
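    # Play the generated audio and block until playback finishes.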
    data, fs = sf.read("output.wav")
    sd.play(data, fs)
    sd.wait()
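    # Optionally visualize the detections, the panel reading order and the speaker assignments.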
    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)

if __name__ == "__main__":
    main()