# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script can be used to preprocess Spoken Wikipedia corpus before running ctc-segmentation.
The input folder consists of subfolders with the following structure
  ├── <article name>
  │   ├── aligned.swc
  │   ├── audiometa.txt
  │   ├── audio.ogg
  │   ├── info.json
  │   ├── wiki.html
  │   ├── wiki.txt
  │   └── wiki.xml

## The destination folder will contain enumerated .ogg and .txt files like this:
  ├── audio
  |   ├── 1.ogg
  |   ├── 2.ogg
  |   ...
  └── text
      ├── 1.txt
      ├── 2.txt
      ...
"""

import argparse
import itertools
import os
import re
import shutil
import subprocess
import tempfile

parser = argparse.ArgumentParser()
parser.add_argument(
    "--input_folder", required=True, type=str, help="Input folder in which each subfolder contains an article"
)
parser.add_argument(
    "--destination_folder", required=True, type=str, help="Destination folder with audio and text subfolder"
)

# Parsed command-line arguments. Filled in by the ``__main__`` guard at the bottom of the file
# (not at import time, so the module can be imported without command-line arguments).
# get_audio() and get_text() read ``input_folder`` and ``destination_folder`` from it.
args = None


# (characters, ASCII replacement) pairs used to build the translation table below.
_DIACRITIC_GROUPS = (
    ("éèëēêęěė", "e"),
    ("ãâāáäăâàąåạả", "a"),
    ("úūüùưûů", "u"),
    ("ôōóöõòő", "o"),
    ("ćçč", "c"),
    ("ïīíîıì", "i"),
    ("ñńňņ", "n"),
    ("țť", "t"),
    ("łľ", "l"),
    ("żžź", "z"),
    ("ğ", "g"),
    ("ř", "r"),
    ("ý", "y"),
    ("æ", "ae"),
    ("œ", "oe"),
    ("șşšś", "s"),
)
_DIACRITICS_TABLE = str.maketrans(
    {char: replacement for chars, replacement in _DIACRITIC_GROUPS for char in chars}
)

# Section headers after which the rest of the article is dropped.
_STOP_SECTION_RE = re.compile(r"== *(?:Notes|References|External links|See also) *==")


def replace_diacritics(text):
    """
    Replaces the lowercase accented letters listed in _DIACRITIC_GROUPS with ASCII equivalents.

    Args:
        text - input string

    Returns:
        The string with each listed character replaced ("æ" -> "ae", "œ" -> "oe", others by a single
        letter). Uppercase letters and unlisted characters are left unchanged.
    """
    # One pass over the string instead of sixteen chained regex substitutions.
    return text.translate(_DIACRITICS_TABLE)


def _find_audio_parts(article_dir):
    """Returns paths audio1.ogg, audio2.ogg, ... from article_dir, stopping at the first missing index."""
    parts = []
    for i in itertools.count(1):
        path = os.path.join(article_dir, "audio" + str(i) + ".ogg")
        if not os.path.exists(path):
            break
        parts.append(path)
    return parts


def _concat_ogg(parts, output_path):
    """Concatenates the .ogg files in parts into output_path using the ffmpeg concat demuxer."""
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as list_file:
        for part in parts:
            # Inside single quotes the concat list syntax needs a literal quote written as '\''.
            escaped = os.path.abspath(part).replace("'", "'\\''")
            list_file.write("file '" + escaped + "'\n")
    # "-safe 0" is required because the list contains absolute paths.
    cmd = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file.name, "-c", "copy", output_path]
    print(" ".join(cmd))
    try:
        subprocess.run(cmd, check=False)
    except OSError as e:
        # E.g. ffmpeg is not installed; the caller notices that output_path was not created.
        print("Failed to run ffmpeg: " + str(e))
    finally:
        os.remove(list_file.name)


def get_audio(name, n):
    """
    Copies .ogg file. If there are several .ogg files, concatenates them.

    If <input_folder>/<name>/audio.ogg does not exist, it is first created inside the input folder from
    audio1.ogg, audio2.ogg, ... (copied if there is one part, merged with ffmpeg if there are several).
    If no audio can be found or produced, nothing is written to the destination folder.

    Args:
        name - name of folder within Spoken Wikipedia
        n - integer that will serve as output file name, e.g. if n=1, file 1.ogg will be created
    """
    article_dir = os.path.join(args.input_folder, name)
    audio_path = os.path.join(article_dir, "audio.ogg")
    if not os.path.exists(audio_path):
        ## Some folders have multiple .ogg files, so we need to first combine them into one file. Example:
        ## |── Universe
        ##  │   ├── aligned.swc
        ##  │   ├── audio1.ogg
        ##  │   ├── audio2.ogg
        ##  │   ├── audio3.ogg
        ##  │   ├── audio4.ogg
        ##  │   ├── audiometa.txt
        ##  │   ├── info.json
        ##  │   ├── wiki.html
        ##  │   ├── wiki.txt
        ##  │   └── wiki.xml
        multiple_ogg_files = _find_audio_parts(article_dir)
        if not multiple_ogg_files:
            return
        if len(multiple_ogg_files) == 1:
            shutil.copyfile(multiple_ogg_files[0], audio_path)
        else:
            _concat_ogg(multiple_ogg_files, audio_path)
        if not os.path.exists(audio_path):
            print("could not create " + audio_path)
            return

    output_audio_path = os.path.join(args.destination_folder, "audio", str(n) + ".ogg")
    os.makedirs(os.path.dirname(output_audio_path), exist_ok=True)
    shutil.copyfile(audio_path, output_audio_path)


def _clean_fragment(s):
    """
    Strips wiki markup from one fragment of a wiki.txt line.

    Returns the cleaned string, or None if the fragment starts with a "Notes", "References",
    "External links" or "See also" section header (the caller then stops processing the article).
    """
    # NOTE(review): the tag names inside the next five patterns were lost from the file this was
    # restored from (only `[^<>]+`, `.+` and `]+/>` survived). They are reconstructed here and must be
    # confirmed against the upstream script.
    s = re.sub(r"<math>[^<>]+</math>", r"", s)
    s = re.sub(r"<!--.+-->", r"", s)  # <!--...--> comments can be inside <ref>
    s = re.sub(r"<ref>.+</ref>", r"", s)
    s = re.sub(r"<ref .+</ref>", r"", s)
    s = re.sub(r"<ref[^<>]+/>", r"", s)
    s = re.sub(r"<[^ <>]+>", r"", s)  # remaining simple tags, e.g. <sub>, <sup>, </u>
    if _STOP_SECTION_RE.match(s):
        return None
    s = re.sub(r"{{convert\|(\d+)\|(\w+)\|[^}]+}}", r"\g<1> \g<2>", s)  # {{convert|7600|lb|kg}}
    s = re.sub(r"{{cquote\|", r"", s)
    s = re.sub(r"{{[^{}]+}}", r"", s)
    s = s.replace("{{", "").replace("}}", "")
    # Parentheses are escaped so that only "(lang-bn...)" groups are removed; unescaped they formed a
    # regex group and deleted everything from any "lang" (e.g. in "language") to the next parenthesis.
    s = re.sub(r"\(lang[^()]+\)", r"", s)  # (lang-bn...)
    s = re.sub(r"==+", r"", s)
    s = re.sub(r"''+", r" ", s)  # remove multiple quotes
    s = re.sub(r" '", r" ", s)  # remove quote at the beginning
    s = re.sub(r"' ", r" ", s)  # remove quote at the end
    s = re.sub(r"[…\*]", r" ", s)
    s = re.sub(r"\\u....", r" ", s)  # remove unicode
    s = re.sub(r"&[^ ;&]+;", r"", s)  # &nbsp; &mdash;
    s = replace_diacritics(s)
    s = re.sub(r"\[\[[^\]]+\|([^\]]+)\]\]", r"\g<1>", s)  # if several variants, take the last one
    s = re.sub(r"\[\[([^\]]+)\]\]", r"\g<1>", s)
    return s


def get_text(name, n):
    """
    Cleans wiki.txt.

    Reads <input_folder>/<name>/wiki.txt line by line, strips wiki markup and writes the result to
    <destination_folder>/text/<n>.txt. Processing stops at the first "Notes", "References",
    "External links" or "See also" section header.

    Args:
        name - name of folder within Spoken Wikipedia
        n - integer that will serve as output file name, e.g. if n=1, file 1.txt will be created
    """
    input_path = os.path.join(args.input_folder, name, "wiki.txt")
    output_path = os.path.join(args.destination_folder, "text", str(n) + ".txt")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(input_path, "r", encoding="utf-8") as f, open(output_path, "w", encoding="utf-8") as out_text:
        for line in f:
            # NOTE(review): the split separator and the loop header were lost from the file this was
            # restored from; splitting on "<ref" (so that each fragment holds at most one reference)
            # is a reconstruction. Any per-fragment skip filters that may have existed between the
            # split and the first substitution could not be recovered - compare with upstream.
            ref_parts = line.strip().split("<ref")
            for idx, s in enumerate(ref_parts):
                if idx != 0:
                    s = "<ref" + s  # restore the separator removed by split()
                s = _clean_fragment(s)
                if s is None:
                    return  # closing sections reached: drop the rest of the article
                out_text.write(s + "\n")


if __name__ == "__main__":
    args = parser.parse_args()
    os.makedirs(os.path.join(args.destination_folder, "audio"), exist_ok=True)
    os.makedirs(os.path.join(args.destination_folder, "text"), exist_ok=True)
    # Sorted so that the numbering of output files is reproducible between runs.
    # Folders without wiki.txt still consume a number, so n always equals the folder's position.
    for n, name in enumerate(sorted(os.listdir(args.input_folder)), start=1):
        if not os.path.exists(os.path.join(args.input_folder, name, "wiki.txt")):
            print("wiki.txt does not exist in " + name)
            continue
        get_audio(name, n)
        get_text(name, n)