|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv |
|
|
import json |
|
|
from argparse import ArgumentParser |
|
|
from glob import glob |
|
|
from typing import List, Tuple |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
""" |
|
|
Converts WikiHomograph data to .json manifest format for HeteronymClassificationModel training. |
|
|
The WikiHomograph dataset can be found here:
|
|
https://github.com/google-research-datasets/WikipediaHomographData |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
def parse_args():
    """Build and parse the command-line arguments for this conversion script."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--data_folder", type=str, required=True, help="Path to data folder with .tsv files"
    )
    arg_parser.add_argument(
        "--output", type=str, required=True, help="Path to output .json file to store the data"
    )
    return arg_parser.parse_args()
|
|
|
|
|
|
|
|
def read_wikihomograph_file(file: str) -> Tuple[List[str], List[List[int]], List[str], List[str]]:
    """
    Reads .tsv file from WikiHomograph dataset,
    e.g. https://github.com/google-research-datasets/WikipediaHomographData/blob/master/data/eval/live.tsv

    Args:
        file: path to .tsv file

    Returns:
        sentences: Text.
        start_end_indices: Start and end indices of the homograph in the sentence.
        heteronyms: Target heteronyms for each sentence
        word_ids: Word_ids corresponding to each heteronym, i.e. label.

    Raises:
        ValueError: if the annotated span does not match the heteronym and the
            heteronym cannot be located unambiguously in the sentence.
    """
    sentences = []
    start_end_indices = []
    heteronyms = []
    word_ids = []
    with open(file, "r", encoding="utf-8") as f:
        tsv_file = csv.reader(f, delimiter="\t")
        for i, line in enumerate(tsv_file):
            # Skip the header row.
            if i == 0:
                continue
            heteronym, wordid, sentence, start, end = line
            start, end = int(start), int(end)
            # Apply known hand-curated corrections to this dataset's
            # sentences/indices before validating the span.
            sentence, start, end = correct_wikihomograph_data(sentence, start, end)

            heteronym_span = sentence[start:end]
            if heteronym_span.lower() != heteronym:
                # Annotated indices are wrong; recover them only when the
                # heteronym occurs exactly once in the sentence.
                sentence_lower = sentence.lower()
                if sentence_lower.count(heteronym) == 1:
                    start = sentence_lower.index(heteronym)
                    end = start + len(heteronym)
                    heteronym_span = sentence[start:end]
                    assert heteronym == heteronym_span.lower()
                else:
                    raise ValueError(f"heteronym {heteronym} != heteronym_span {heteronym_span} in {sentence}")

            heteronyms.append(heteronym)
            start_end_indices.append([start, end])
            sentences.append(sentence)
            word_ids.append(wordid)

    return sentences, start_end_indices, heteronyms, word_ids
|
|
|
|
|
|
|
|
def correct_wikihomograph_data(sentence: str, start: int = None, end: int = None):
    """
    Apply manual corrections to WikiHomograph data.

    Args:
        sentence: sentence
        start: start index of homograph
        end: end index of homograph

    Returns:
        The (possibly normalized) sentence along with the corrected start and end indices.
    """
    # Hand-curated homograph index fixes, keyed by the exact sentence text.
    index_fixes = {
        "It is traditionally composed of 85–99% tin, mixed with copper, antimony, bismuth, and sometimes lead, although the use of lead is less common today.": (
            96,
            100,
        ),
        "B₁₀₅ can be conceptually divided into a B₄₈ fragment and B₂₈-B-B₂₈ (B₅₇) fragment.": (44, 52),
        "Pierrefonds Airport on Réunion recorded just 18 mm (0.71 in) of rainfall from November to January, a record minimum.": (
            101,
            107,
        ),
        "Consort Chen Farong (陳法容) was an imperial consort during the Chinese dynasty Liu Song.": (42, 49),
        "Unlike TiO₂, which features six-coordinate Ti in all phases, monoclinic zirconia consists of seven-coordinate zirconium centres.": (
            32,
            42,
        ),
        "Its area is 16 km², its approximate length is 10 km, and its approximate width is 3 km.": (24, 35),
        "The conjugate momentum to X has the expressionwhere the pᵢ are the momentum functions conjugate to the coordinates.": (
            86,
            95,
        ),
        "Furthermore 17β-HSD1 levels positively correlate with E2 and negatively correlate with DHT levels in breast cancer cells.": (
            39,
            48,
        ),
        "Electric car buyers get a €4,000 (US$4,520) discount while buyers of plug-in hybrid vehicles get a discount of €3,000 (US$3,390).": (
            99,
            107,
        ),
    }

    corrected_indices = index_fixes.get(sentence)
    if corrected_indices is not None:
        start, end = corrected_indices

    # Text normalizations applied to every sentence.
    for target, replacement in (("2014Coordinate", "2014 Coordinate"), ("AAA", "triple A")):
        sentence = sentence.replace(target, replacement)

    return sentence, start, end
|
|
|
|
|
|
|
|
def convert_wikihomograph_data_to_manifest(data_folder: str, output_manifest: str):
    """
    Convert WikiHomograph data to .json manifest (one JSON object per line).

    Args:
        data_folder: data_folder that contains .tsv files
        output_manifest: path to output file
    """
    # encoding="utf-8" is required: sentences contain non-ASCII characters
    # (e.g. "km²", "€") and entries are serialized with ensure_ascii=False,
    # so a platform-dependent default encoding could fail to encode them.
    with open(output_manifest, "w", encoding="utf-8") as f_out:
        for file in tqdm(glob(f"{data_folder}/*.tsv")):
            sentences, start_end_indices, heteronyms, word_ids = read_wikihomograph_file(file)
            for sent, (start, end), word_id in zip(sentences, start_end_indices, word_ids):
                entry = {
                    "text_graphemes": sent,
                    "start_end": [start, end],
                    "heteronym_span": sent[start:end],
                    "word_id": word_id,
                }
                f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"Data saved at {output_manifest}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse CLI arguments and run the conversion.
    cli_args = parse_args()
    convert_wikihomograph_data_to_manifest(cli_args.data_folder, cli_args.output)
|
|
|