Sonofica / utils /xml_to_json.py
janmayjay's picture
Add application file
39a7537
raw
history blame
1.67 kB
# Extract speaker info and add to parsed-XML json object
import xml.etree.ElementTree as ET
import manga109api
import os
import json
def xml_to_json(config):
    """Attach speaker ids to every book's text annotations and dump them as JSON.

    For each entry found in ``<manga109_root_dir>/images`` the function reads
    ``<manga109_root_dir>/annotations_Manga109Dialog/<book>.xml``, collects the
    ``text_id -> speaker_id`` pairs of every page, loads the book annotation
    through ``manga109api.Parser`` and stores the speaker id under the
    ``"@bodyid"`` key of each text object whose ``"@id"`` has a pair.  The
    enriched annotation is written to ``<root>/json/<book>.json``.

    Args:
        config: Mapping that provides ``"manga109_root_dir"`` (directory holding
            ``images`` and ``annotations_Manga109Dialog``) and ``"root"``
            (directory that receives the ``json`` output folder).  Both may be
            given with or without a trailing path separator.

    Returns:
        None.  The results are the JSON files written to disk; the book name
        and the number of pages in its dialog XML are printed as progress.
    """
    manga109_root_dir = config["manga109_root_dir"]
    output_dir = os.path.join(config["root"], "json")

    def load_speaker_links(book):
        """Return ``{page index: {text_id: speaker_id}}`` from the dialog XML of *book*."""
        xml_path = os.path.join(
            manga109_root_dir, "annotations_Manga109Dialog", book + ".xml"
        )
        pages = ET.parse(xml_path).getroot().find("pages")
        print(len(pages))

        links_by_page = {}
        for position, page in enumerate(pages):
            # Key by the page's own index so a missing or reordered page in the
            # dialog XML cannot shift the links onto the wrong page; fall back
            # to the position when the attribute is absent or not numeric.
            try:
                page_index = int(page.attrib["index"])
            except (KeyError, ValueError):
                page_index = position

            page_links = links_by_page.setdefault(page_index, {})
            for link in page.findall("speaker_to_text"):
                page_links[link.attrib.get("text_id")] = link.attrib.get("speaker_id")
        return links_by_page

    def create_json(book):
        """Merge the speaker links of *book* into its annotation and write the JSON file."""
        links_by_page = load_speaker_links(book)

        parser = manga109api.Parser(root_dir=manga109_root_dir)
        annotation = parser.get_annotation(book=book)

        for page in annotation["page"]:
            # A page without any dialog entry simply has no links to apply.
            page_links = links_by_page.get(int(page["@index"]), {})
            for text in page["text"]:
                text_id = text["@id"]
                if text_id in page_links:
                    text["@bodyid"] = page_links[text_id]

        # Create the output folder up front so the first write cannot fail
        # merely because the folder has not been made yet.
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, book + ".json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(annotation, f, ensure_ascii=False, indent=2)

    for book in os.listdir(os.path.join(manga109_root_dir, "images")):
        print(book)
        create_json(book)