| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						""" | 
					
					
						
						| 
							 | 
This script can be used to preprocess the Spoken Wikipedia corpus before running ctc-segmentation.
The input folder consists of subfolders with the following structure:
  ├── <Name of Wikipedia article>
  │   ├── aligned.swc
  │   ├── audiometa.txt
  │   ├── audio.ogg
  │   ├── info.json
  │   ├── wiki.html
  │   ├── wiki.txt
  │   └── wiki.xml


The destination folder will contain enumerated .ogg and .txt files like this:
  ├── audio
  │   ├── 1.ogg
  │   ├── 2.ogg
  │   ...
  └── text
      ├── 1.txt
      ├── 2.txt
      ...
					
					
						
						| 
							 | 
						""" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import argparse | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import re | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						parser = argparse.ArgumentParser() | 
					
					
						
						| 
							 | 
						parser.add_argument( | 
					
					
						
						| 
							 | 
						    "--input_folder", required=True, type=str, help="Input folder in which each subfolder contains an article" | 
					
					
						
						| 
							 | 
						) | 
					
					
						
						| 
							 | 
						parser.add_argument( | 
					
					
						
						| 
							 | 
						    "--destination_folder", required=True, type=str, help="Destination folder with audio and text subfolder" | 
					
					
						
						| 
							 | 
						) | 
					
					
						
						| 
							 | 
						args = parser.parse_args() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def replace_diacritics(text): | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[éèëēêęěė]", "e", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ãâāáäăâàąåạả]", "a", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[úūüùưûů]", "u", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ôōóöõòő]", "o", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ćçč]", "c", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ïīíîıì]", "i", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ñńňņ]", "n", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[țť]", "t", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[łľ]", "l", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[żžź]", "z", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ğ]", "g", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ř]", "r", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[ý]", "y", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[æ]", "ae", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[œ]", "oe", text) | 
					
					
						
						| 
							 | 
						    text = re.sub(r"[șşšś]", "s", text) | 
					
					
						
						| 
							 | 
						    return text | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def get_audio(name, n): | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    Copies .ogg file. If there are several .ogg files, concatenates them. | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						    Args: | 
					
					
						
						| 
							 | 
						        name - name of folder within Spoken Wikipedia | 
					
					
						
						| 
							 | 
						        n - integer that will serve as output file name, e.g. if n=1, file 1.ogg will be created   | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    audio_path = os.path.join(args.input_folder, name, "audio.ogg") | 
					
					
						
						| 
							 | 
						    if not os.path.exists(audio_path): | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        multiple_ogg_files = [] | 
					
					
						
						| 
							 | 
						        for i in range(1, 5): | 
					
					
						
						| 
							 | 
						            path = os.path.join(args.input_folder, name, "audio" + str(i) + ".ogg") | 
					
					
						
						| 
							 | 
						            if os.path.exists(path): | 
					
					
						
						| 
							 | 
						                multiple_ogg_files.append(path) | 
					
					
						
						| 
							 | 
						            else: | 
					
					
						
						| 
							 | 
						                break | 
					
					
						
						| 
							 | 
						        if len(multiple_ogg_files) == 0: | 
					
					
						
						| 
							 | 
						            return | 
					
					
						
						| 
							 | 
						        elif len(multiple_ogg_files) == 1: | 
					
					
						
						| 
							 | 
						            os.system("cp \"" + multiple_ogg_files[0] + "\" \"" + audio_path + "\"") | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            tmp_file_name = "ffmeg_inputs.txt" | 
					
					
						
						| 
							 | 
						            print("tmp_file_name=", tmp_file_name) | 
					
					
						
						| 
							 | 
						            with open(tmp_file_name, "w", encoding="utf-8") as tmp_file: | 
					
					
						
						| 
							 | 
						                for path in multiple_ogg_files: | 
					
					
						
						| 
							 | 
						                    tmp_file.write("file '" + path + "'\n") | 
					
					
						
						| 
							 | 
						            cmd = "ffmpeg -f concat -i \"" + tmp_file_name + "\" -c copy \"" + audio_path + "\"" | 
					
					
						
						| 
							 | 
						            print(cmd) | 
					
					
						
						| 
							 | 
						            os.system(cmd) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    output_audio_path = args.destination_folder + "/audio/" + str(n) + ".ogg" | 
					
					
						
						| 
							 | 
						    os.system("cp \"" + audio_path + "\" " + output_audio_path) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def get_text(name, n): | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    Cleans wiki.txt. | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						    Args: | 
					
					
						
						| 
							 | 
						        name - name of folder within Spoken Wikipedia | 
					
					
						
						| 
							 | 
						        n - integer that will serve as output file name, e.g. if n=1, file 1.txt will be created   | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    out_text = open(args.destination_folder + "/text/" + str(n) + ".txt", "w", encoding="utf-8") | 
					
					
						
						| 
							 | 
						    with open(args.input_folder + "/" + name + "/wiki.txt", "r", encoding="utf-8") as f: | 
					
					
						
						| 
							 | 
						        for line in f: | 
					
					
						
						| 
							 | 
						            do_break = False | 
					
					
						
						| 
							 | 
						            line2 = line.strip() | 
					
					
						
						| 
							 | 
						            ref_parts = line2.split("<ref") | 
					
					
						
						| 
							 | 
						            for idx, s in enumerate(ref_parts): | 
					
					
						
						| 
							 | 
						                if idx != 0: | 
					
					
						
						| 
							 | 
						                    s = "<ref" + s | 
					
					
						
						| 
							 | 
						                if s.startswith("[[Image") and s.endswith("]]"): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                if s.startswith("[[File") and s.endswith("]]"): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                if s.startswith(":"): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                if s.startswith("{|") or s.startswith("|}") or s.startswith("|") or s.startswith("!"): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                if s.startswith("{{") and (s.endswith("}}") or "}}" not in s): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                if s.startswith("{{pp-move"): | 
					
					
						
						| 
							 | 
						                    continue | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\[\[Image\:[^\]]+\]\]", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\[\[File\:[^\]]+\]\]", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\[http[^\]]+\]", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<math>[^<>]+</math>", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<!\-\-.+\-\->", r"", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<ref>.+</ref>", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<ref .+</ref>", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<ref[^<>]+/>", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"<[^ <>]+>", r"", s)   | 
					
					
						
						| 
							 | 
						                if ( | 
					
					
						
						| 
							 | 
						                    re.match(r"== *Notes *==", s) | 
					
					
						
						| 
							 | 
						                    or re.match(r"== *References *==", s) | 
					
					
						
						| 
							 | 
						                    or re.match(r"== *External links *==", s) | 
					
					
						
						| 
							 | 
						                    or re.match(r"== *See also *==", s) | 
					
					
						
						| 
							 | 
						                ): | 
					
					
						
						| 
							 | 
						                    do_break = True | 
					
					
						
						| 
							 | 
						                    break | 
					
					
						
						| 
							 | 
						                s = re.sub(r"{{convert\|(\d+)\|(\w+)\|[^}]+}}", r"\g<1> \g<2>", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"{{cquote\|", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"{{[^{}]+}}", r"", s) | 
					
					
						
						| 
							 | 
						                s = s.replace("{{", "").replace("}}", "") | 
					
					
						
						| 
							 | 
						                s = re.sub(r"(lang[^()]+)", r"", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"==+", r"", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"''+", r" ", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r" '", r" ", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"' ", r" ", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"[…\*]", r" ", s) | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\\u....", r" ", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"&[^ ;&]+;", r"", s)   | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                s = replace_diacritics(s) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\[\[[^\]]+\|([^\]]+)\]\]", r"\g<1>", s)   | 
					
					
						
						| 
							 | 
						                s = re.sub(r"\[\[([^\]]+)\]\]", r"\g<1>", s) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                out_text.write(s + "\n") | 
					
					
						
						| 
							 | 
						            if do_break: | 
					
					
						
						| 
							 | 
						                break | 
					
					
						
						| 
							 | 
						    out_text.close() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						if __name__ == "__main__": | 
					
					
						
						| 
							 | 
						    n = 0 | 
					
					
						
						| 
							 | 
						    for name in os.listdir(args.input_folder): | 
					
					
						
						| 
							 | 
						        n += 1 | 
					
					
						
						| 
							 | 
						        if not os.path.exists(args.input_folder + "/" + name + "/wiki.txt"): | 
					
					
						
						| 
							 | 
						            print("wiki.txt does not exist in " + name) | 
					
					
						
						| 
							 | 
						            continue | 
					
					
						
						| 
							 | 
						        get_audio(name, n) | 
					
					
						
						| 
							 | 
						        get_text(name, n) | 
					
					
						
						| 
							 | 
						
 |