|
|
|
import xml.etree.ElementTree as ET
|
|
import manga109api
|
|
import os
|
|
import json
|
|
def xml_to_json(config):
    """Merge Manga109Dialog speaker links into Manga109 annotations as JSON.

    For every directory found under ``<manga109_root_dir>/images`` the
    matching ``annotations_Manga109Dialog/<book>.xml`` file is parsed, the
    book's annotation is loaded through ``manga109api``, and every text
    object whose ``@id`` is referenced by a ``speaker_to_text`` element gets
    an ``@bodyid`` key holding that element's ``speaker_id``. The enriched
    annotation is written to ``<root>/json/<book>.json``.

    Args:
        config: Mapping providing ``"manga109_root_dir"`` (dataset root that
            contains ``images`` and ``annotations_Manga109Dialog``) and
            ``"root"`` (directory under which the ``json`` folder is written).

    Returns:
        None. The result is the set of JSON files written to disk; the book
        name and its dialog page count are printed as progress output.

    Raises:
        KeyError: If ``config`` lacks one of the two required keys.
        OSError: If a dialog XML file or the output location is not usable.
        xml.etree.ElementTree.ParseError: If a dialog XML file is malformed.
    """
    manga109_root_dir = config["manga109_root_dir"]
    # os.path.join keeps the paths valid whether or not the configured
    # directories end with a separator (plain "+" concatenation did not).
    images_dir = os.path.join(manga109_root_dir, "images")
    dialog_dir = os.path.join(manga109_root_dir, "annotations_Manga109Dialog")
    output_dir = os.path.join(config["root"], "json")

    def load_speaker_links(xml_path):
        """Return ``{page index: {text_id: speaker_id}}`` for one dialog XML."""
        pages = ET.parse(xml_path).getroot().find('pages')
        print(len(pages))
        links_by_page = {}
        for position, page in enumerate(pages):
            # Key by the page's own index attribute so a gap or reordering in
            # the dialog XML cannot shift speakers onto the wrong page; fall
            # back to the document position when the attribute is absent.
            raw_index = page.attrib.get('index')
            page_index = position if raw_index is None else int(raw_index)
            links_by_page[page_index] = {
                link.attrib.get('text_id'): link.attrib.get('speaker_id')
                for link in page.findall('speaker_to_text')
            }
        return links_by_page

    # The parser does not depend on the book, so build it once.
    parser = manga109api.Parser(root_dir=manga109_root_dir)
    os.makedirs(output_dir, exist_ok=True)

    for book in sorted(os.listdir(images_dir)):
        # Only directories are books; skip stray files such as .DS_Store.
        if not os.path.isdir(os.path.join(images_dir, book)):
            continue
        print(book)

        links_by_page = load_speaker_links(os.path.join(dialog_dir, book + ".xml"))
        annotation = parser.get_annotation(book=book)

        for page in annotation['page']:
            # NOTE(review): manga109api appears to deliver '@index' as an int;
            # int() makes the lookup work for a string value as well.
            page_links = links_by_page.get(int(page['@index']), {})
            for text in page['text']:
                text_id = text['@id']
                if text_id in page_links:
                    text['@bodyid'] = page_links[text_id]

        json_path = os.path.join(output_dir, book + ".json")
        with open(json_path, "w", encoding='utf-8') as f:
            json.dump(annotation, f, ensure_ascii=False, indent=2)