# Extract speaker info and add to parsed-XML json object
import json
import os
import xml.etree.ElementTree as ET

import manga109api


def _page_key(raw_index, fallback):
    """Return *raw_index* as an int page index, or *fallback* if it is not numeric.

    The dialog XML stores the index as a string attribute, while the
    annotation is indexed with ``page['@index']`` (presumably an int as
    produced by manga109api -- verify against the installed version).
    Normalising both sides to int lets them be matched reliably.
    """
    try:
        return int(raw_index)
    except (TypeError, ValueError):
        return fallback


def _load_speaker_map(xml_path):
    """Parse a dialog XML file into ``{page index: {text id: speaker id}}``.

    Args:
        xml_path: Path of the ``annotations_Manga109Dialog`` XML for one book.

    Returns:
        A dict keyed by page index. Each value maps the ``text_id`` attribute
        of every ``speaker_to_text`` element on that page to its
        ``speaker_id`` attribute. When a text id occurs more than once on a
        page, the last occurrence wins.

    Raises:
        TypeError: If the document has no ``pages`` element.
    """
    pages = ET.parse(xml_path).getroot().find('pages')
    # Progress output: number of pages found in the dialog annotation.
    print(len(pages))

    speakers_by_page = {}
    for position, page in enumerate(pages):
        # Key by the page's own index so lookups do not depend on the pages
        # being listed contiguously and in order; fall back to the position
        # when the attribute is absent or not numeric.
        key = _page_key(page.attrib.get('index'), position)
        links = speakers_by_page.setdefault(key, {})
        for link in page.findall('speaker_to_text'):
            links[link.attrib.get('text_id')] = link.attrib.get('speaker_id')
    return speakers_by_page


def _attach_speakers(annotation, speakers_by_page):
    """Add ``'@bodyid'`` (the speaker id) to every text with a known speaker.

    Mutates *annotation* in place. Pages or texts without an entry in
    *speakers_by_page* are left untouched.
    """
    for position, page in enumerate(annotation['page']):
        links = speakers_by_page.get(_page_key(page.get('@index'), position))
        if not links:
            # No speaker annotation for this page (or the page is missing
            # from the dialog XML altogether).
            continue
        for text in page['text']:
            text_id = text['@id']
            if text_id in links:
                text['@bodyid'] = links[text_id]


def xml_to_json(config):
    """Merge speaker annotations into each book's annotation and dump JSON.

    For every book directory under ``<manga109_root_dir>/images`` this reads
    ``<manga109_root_dir>/annotations_Manga109Dialog/<book>.xml``, loads the
    book's annotation through ``manga109api``, tags each text object that has
    a speaker with ``'@bodyid'``, and writes the result to
    ``<root>/json/<book>.json`` (UTF-8, non-ASCII preserved, indent 2).

    Args:
        config: Mapping with the keys ``"manga109_root_dir"`` (dataset root)
            and ``"root"`` (directory under which ``json/`` is written).
    """
    manga109_root_dir = config["manga109_root_dir"]
    dialog_dir = os.path.join(manga109_root_dir, "annotations_Manga109Dialog")
    images_dir = os.path.join(manga109_root_dir, "images")
    json_dir = os.path.join(config["root"], "json")
    os.makedirs(json_dir, exist_ok=True)

    # One parser serves every book; building it per book repeats its setup.
    parser = manga109api.Parser(root_dir=manga109_root_dir)

    for book in sorted(os.listdir(images_dir)):
        # Books are directories; skip stray files such as .DS_Store.
        if not os.path.isdir(os.path.join(images_dir, book)):
            continue
        print(book)

        speakers_by_page = _load_speaker_map(
            os.path.join(dialog_dir, book + ".xml")
        )
        annotation = parser.get_annotation(book=book)
        _attach_speakers(annotation, speakers_by_page)

        out_path = os.path.join(json_dir, book + ".json")
        with open(out_path, "w", encoding='utf-8') as f:
            json.dump(annotation, f, ensure_ascii=False, indent=2)