File size: 1,673 Bytes
39a7537 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# Extract speaker info from the Manga109Dialog annotations and add it to the parsed-XML JSON object of each book
import xml.etree.ElementTree as ET
import manga109api
import os
import json
def _load_speaker_links(xml_path):
    """Parse a speaker-annotation XML file into per-page lookup tables.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        A list with one dict per child of the ``pages`` element, in document
        order. Each dict maps the ``text_id`` attribute of every
        ``speaker_to_text`` element on that page to its ``speaker_id``
        attribute.
    """
    pages = ET.parse(xml_path).getroot().find('pages')
    return [
        {
            link.attrib.get('text_id'): link.attrib.get('speaker_id')
            for link in page.findall('speaker_to_text')
        }
        for page in pages
    ]


def xml_to_json(config):
    """Write one JSON annotation file per book, enriched with speaker ids.

    For every entry in ``<manga109_root_dir>/images`` the book annotation is
    loaded through ``manga109api``, each text object that has a matching
    entry in ``annotations_Manga109Dialog/<book>.xml`` receives an
    ``'@bodyid'`` key holding the speaker id, and the result is dumped to
    ``<root>/json/<book>.json``.

    Args:
        config: Mapping that provides ``"manga109_root_dir"`` (dataset root)
            and ``"root"`` (parent of the ``json`` output directory). The
            output directory is not created here.
    """
    manga109_root_dir = config["manga109_root_dir"]
    # The parser depends only on the dataset root, so build it once for all books.
    parser = manga109api.Parser(root_dir=manga109_root_dir)

    for book in os.listdir(os.path.join(manga109_root_dir, "images")):
        print(book)

        speaker_xml = os.path.join(
            manga109_root_dir, "annotations_Manga109Dialog", book + ".xml"
        )
        links_per_page = _load_speaker_links(speaker_xml)
        print(len(links_per_page))

        annotation = parser.get_annotation(book=book)
        for page in annotation['page']:
            for text in page['text']:
                # Pages are matched by position: the annotation's '@index'
                # selects the entry of the speaker list built in file order.
                page_links = links_per_page[page['@index']]
                if text['@id'] in page_links:
                    text['@bodyid'] = page_links[text['@id']]

        out_path = os.path.join(config["root"], "json", book + ".json")
        with open(out_path, "w", encoding='utf-8') as f:
            json.dump(annotation, f, ensure_ascii=False, indent=2)