import argparse
import json

import sounddevice as sd
import soundfile as sf
import torch
from scipy.io.wavfile import write
from transformers import pipeline
from ultralytics import YOLO

from utils.character_to_character import count_characters_single_page
from utils.create_audiobook import create_audiobook
from utils.identify_speaker import (
    get_predictions_from_yolo,
    identify_speaker,
    load_trained_speaker_model,
)
from utils.panel_ordering import panel_ordering
from utils.transcript_page import transcript_page
from utils.utilities import (
    get_speakers,
    process_results,
    show_panel_order,
    show_prediction_per_page,
    visualize_speaker_results,
)
from utils.visual_llm_gender import visual_llm_gender


def main():
    parser = argparse.ArgumentParser(description="Script to transcribe a single manga page image")
    parser.add_argument(
        "-img", "--test_image",
        type=str,
        default="./dataset/images/test/AkkeraKanjinchou_083.jpg",
        help="Test image on which the transcription is run",
    )
    parser.add_argument(
        "-vis", "--visualize",
        action="store_true",
        help="Visualize the predictions (flag only, takes no value)",
    )

    args = parser.parse_args()
    print(f"Test image at : {args.test_image}")
    print(f"Visualize? : {args.visualize}")
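    # Load the model paths from the repository's config.json and pick the compute device.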
    with open("config.json") as f:
        config = json.load(f)

    device = "cuda" if torch.cuda.is_available() else "cpu"
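    # Run YOLO detection on the test page; these predictions feed every later stage.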
    yolo_model = YOLO(config["yolo_path"])
    test_image = args.test_image
    predictions = get_predictions_from_yolo(test_image, yolo_model)
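    # Estimate the reading order of the detected panels.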
    boxOrderEstimator = panel_ordering(test_image, predictions)
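    # Cluster the character detections to find the unique characters on the page.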
    unique_chars, labels, pairs = count_characters_single_page(
        test_image, predictions, eps=0.3, min_samples=1, device=device
    )
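    # Load the trained speaker-association model and identify the speaker of each piece of dialogue.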
    assoc_model = load_trained_speaker_model(config["speaker_association_path"], device)
    results, predictions = identify_speaker(assoc_model, test_image, yolo_model, device)
    processed_predictions = process_results(predictions, boxOrderEstimator, pairs, labels, results)
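    # Translate the extracted Japanese text and assemble the per-page transcripts.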
    translator = pipeline("translation", model="Mitsua/elan-mt-bt-ja-en")
    japanese_transcript, english_transcript = transcript_page(test_image, translator, processed_predictions)
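    # Infer each unique character's gender with a visual LLM and derive the speaker list from it.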
    gender_list = visual_llm_gender(unique_chars, predictions, test_image)
    speaker_list = get_speakers(gender_list)
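    # Synthesize the audiobook audio and save it as a WAV file.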
    combined, sample_rate = create_audiobook(gender_list, english_transcript, speaker_list)
    write("output.wav", sample_rate, combined)
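    # Play the generated audio and block until playback finishes.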
    data, fs = sf.read("output.wav")
    sd.play(data, fs)
    sd.wait()
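    # Optionally visualize the detections, the panel reading order and the speaker assignments.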
    if args.visualize:
        show_prediction_per_page(test_image, processed_predictions)
        show_panel_order(test_image, boxOrderEstimator)
        visualize_speaker_results(test_image, processed_predictions, results)

if __name__ == "__main__":
    main()