# Sonofica / page_transcription.py
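# End-to-end transcription pipeline for a single page: detect panels, speech
# bubbles, faces and bodies with YOLO, order the panels, cluster characters,
# identify speakers, translate the dialogue to English and synthesize an
# audiobook that is written to output.wav and played back.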
import argparse
import json
import torch
from ultralytics import YOLO
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.character_to_character import count_characters_single_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from transformers import pipeline
from utils.transcript_page import transcript_page
from utils.visual_llm_gender import visual_llm_gender
from utils.create_audiobook import create_audiobook
from scipy.io.wavfile import write
import sounddevice as sd
import soundfile as sf
def main():
    parser = argparse.ArgumentParser(description="Transcribe a single page image")
    parser.add_argument(
        "-img", "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Path to the test image to transcribe"
    )
    parser.add_argument(
        "-vis", "--visualize",
        action="store_true",
        help="Visualize predictions (boolean flag, no value needed)"
    )
args = parser.parse_args()
print(f"Test image at : {args.test_image}")
print(f"Visualize? : {args.visualize}")
    with open('config.json') as f:
        config = json.load(f)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
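    # Detect panels, speech bubbles, faces and bodies on the page with the YOLO model.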
yolo_model = YOLO(config["yolo_path"])
test_image = args.test_image
predictions = get_predictions_from_yolo(test_image, yolo_model)
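    # Estimate the reading order of the detected panels.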
    boxOrderEstimator = panel_ordering(test_image, predictions)
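    # Cluster character detections into unique characters and character pairs
    # (the eps/min_samples parameters suggest a density-based clustering such as DBSCAN).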
    unique_chars, labels, pairs = count_characters_single_page(test_image, predictions, eps=0.3, min_samples=1, device=device)
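    # Associate each speech bubble with its most likely speaker using the trained
    # speaker-association model, then merge everything into ordered per-panel predictions.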
    assoc_model = load_trained_speaker_model(config['speaker_association_path'], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
# print("Panels predicted present in predictions.panels")
# for panel in predictions.panels:
# print(panel)
# print("Sppech bubbles predicted present in predictions.speech_bubbles")
# for panel in predictions.speech_bubbles:
# print(panel)
# print("Faces predicted present in predictions.faces")
# for panel in predictions.faces:
# print(panel)
# print("Bodies predicted present in predictions.bodies")
# for panel in predictions.bodies:
# print(panel)
translator = pipeline('translation', model='Mitsua/elan-mt-bt-ja-en')
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
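    # Estimate each unique character's gender with a visual LLM and pick a speaker voice
    # per character for the audiobook synthesis.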
    gender_list = visual_llm_gender(unique_chars, predictions, test_image)
    speaker_list = get_speakers(gender_list)
    combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
write("output.wav", sample_rate, combined)
data, fs = sf.read('output.wav')
sd.play(data, fs)
sd.wait()
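    # Optionally visualize detections, panel reading order and speaker assignments.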
    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)
if __name__ == "__main__":
main()