|
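"""Transcribe a single manga page image and render it as an audiobook."""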
import argparse
import json

import torch
from scipy.io.wavfile import write
from transformers import pipeline
from ultralytics import YOLO

from utils.character_to_character import count_characters_single_page
from utils.create_audiobook import create_audiobook
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.transcript_page import transcript_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from utils.visual_llm_gender import visual_llm_gender


def main():
    parser = argparse.ArgumentParser(description="Transcribe a single manga page and render it as an audiobook")
    parser.add_argument(
        "-img", "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Path to the manga page image to transcribe",
    )
    parser.add_argument(
        "-vis", "--visualize",
        action="store_true",
        help="Visualize predictions (flag only; takes no value)",
    )

    args = parser.parse_args()
    print(f"Test image: {args.test_image}")
    print(f"Visualize: {args.visualize}")

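    # Load paths from config.json, pick a device, and load the YOLO detector.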
    with open('config.json') as f:
        config = json.load(f)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    yolo_model = YOLO(config["yolo_path"])
    test_image = args.test_image

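    # Detect panels, text boxes, and character boxes on the page.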
    predictions = get_predictions_from_yolo(test_image, yolo_model)

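    # Estimate the reading order of the detected panels.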
    boxOrderEstimator = panel_ordering(test_image, predictions)

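    # Cluster character detections to count the unique characters on the page.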
    unique_chars, labels, pairs = count_characters_single_page(
        test_image, predictions, eps=0.3, min_samples=1, device=device
    )

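    # Match each text box to the character who speaks it.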
    assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)

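    # Read the text boxes and translate the Japanese transcript to English.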
    translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)

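    # Synthesize speech for each translated line and concatenate into one track.
    # NOTE: a single sample voice is reused for every line here; per-character
    # voices can be substituted once speakers are mapped to TTS voice names.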
    speaker_list = ['Eugenio_Mataracı'] * len(english_transcript)  # one entry per line (was a hard-coded repeated list)
    combined, sample_rate = create_audiobook(english_transcript, speaker_list)
    write("output.wav", sample_rate, combined)

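    # Optional debug views: detections, panel reading order, and speaker links.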
    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)


if __name__ == "__main__":
    main()