| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						import argparse | 
					
					
						
						| 
							 | 
						import json | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import subprocess | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						# Command-line interface: both folders are mandatory and are read once,
						# when the module is loaded; the result is kept in the module-level `args`.
						parser = argparse.ArgumentParser(description="Processing Aishell2 Data")
						parser.add_argument(
						    "--audio_folder",
						    type=str,
						    required=True,
						    default=None,
						    help="Audio (wav) data directory.",
						)
						parser.add_argument(
						    "--dest_folder",
						    type=str,
						    required=True,
						    default=None,
						    help="Destination directory.",
						)
						args = parser.parse_args()
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def __process_data(data_folder: str, dst_folder: str):
						    """
						    Generate one JSON-lines manifest per data split.

						    For each of the ``dev``, ``test`` and ``train`` splits this reads
						    ``<data_folder>/transcript/<split>/trans.txt``, walks every session
						    directory below ``<data_folder>/wav/<split>`` and writes
						    ``<dst_folder>/<split>.json``. Each manifest line is a JSON object with
						    the keys ``audio_filepath`` (absolute path), ``duration`` (the number
						    printed by ``soxi -D``) and ``text`` (the upper-cased transcript).

						    Blank transcript lines, entries of the wav directory that are not
						    directories, and files without a ``.wav`` extension are skipped.

						    Args:
						        data_folder: source with wav files
						        dst_folder: where manifest files will be stored
						    Returns:
						        None
						    Raises:
						        KeyError: a wav file has no entry in the split's transcript file.
						        subprocess.CalledProcessError: ``soxi`` exits with a non-zero status.
						    """
						    # exist_ok avoids the check-then-create race of a separate exists() test.
						    os.makedirs(dst_folder, exist_ok=True)

						    for split in ('dev', 'test', 'train'):
						        dst_file = os.path.join(dst_folder, split + ".json")
						        wav_dir = os.path.join(data_folder, "wav", split)
						        transcript_file = os.path.join(data_folder, "transcript", split, "trans.txt")

						        # Map utterance id -> upper-cased transcript.
						        trans_text = {}
						        with open(transcript_file, "r", encoding='utf-8') as f:
						            for raw_line in f:
						                fields = raw_line.split()
						                if not fields:
						                    # A blank line carries no utterance id.
						                    continue
						                utterance_id, *words = fields
						                trans_text[utterance_id] = " ".join(words).upper()

						        utterances = []
						        # Sorted so the manifest order does not depend on the filesystem.
						        for session in sorted(os.listdir(wav_dir)):
						            cur_dir = os.path.join(wav_dir, session)
						            if not os.path.isdir(cur_dir):
						                continue
						            for wav_name in sorted(os.listdir(cur_dir)):
						                # splitext removes exactly the extension; str.strip(".wav")
						                # would also eat leading/trailing 'w', 'a', 'v' and '.'
						                # characters of the utterance id itself.
						                audio_id, extension = os.path.splitext(wav_name)
						                if extension.lower() != ".wav":
						                    continue
						                audio_filepath = os.path.abspath(os.path.join(cur_dir, wav_name))
						                # Argument list instead of a shell string: paths containing
						                # spaces or shell metacharacters are passed through verbatim.
						                duration = float(subprocess.check_output(["soxi", "-D", audio_filepath]))
						                text = trans_text[audio_id]
						                utterances.append(
						                    json.dumps(
						                        {"audio_filepath": audio_filepath, "duration": duration, "text": text},
						                        ensure_ascii=False,
						                    )
						                )

						        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
						        # must be fixed rather than left to the platform default.
						        with open(dst_file, "w", encoding='utf-8') as f:
						            f.writelines(entry + "\n" for entry in utterances)
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def __get_vocab(data_folder: str, des_dir: str):
						    """
						    Generate the vocabulary file from the training transcripts.

						    Reads ``<data_folder>/transcript/train/trans.txt``, drops the first
						    whitespace-separated field of every line (the utterance id), re-joins
						    the remaining fields with single spaces, upper-cases the result and
						    counts every character of it (the joining spaces included). The
						    distinct characters are written to ``<des_dir>/vocab.txt``, one per
						    line, most frequent first; characters with equal counts keep the order
						    in which they were first seen.

						    Args:
						        data_folder: source with the transcript file
						        des_dir: where the file will be stored
						    Returns:
						        None
						    """
						    from collections import Counter

						    # exist_ok avoids the check-then-create race of a separate exists() test.
						    os.makedirs(des_dir, exist_ok=True)

						    trans_file = os.path.join(data_folder, "transcript", "train", "trans.txt")
						    char_counts = Counter()
						    with open(trans_file, "r", encoding='utf-8') as f:
						        for raw_line in f:
						            words = raw_line.split()[1:]
						            char_counts.update(" ".join(words).upper())

						    vocab_path = os.path.join(des_dir, "vocab.txt")
						    # The context manager closes the file even if a write fails.
						    with open(vocab_path, "w", encoding='utf-8') as vocab:
						        # most_common() is a stable descending sort on the counts.
						        vocab.writelines(char + "\n" for char, _ in char_counts.most_common())
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def main():
						    """
						    Run the whole preparation: write the manifests, then the vocabulary.

						    The source and destination folders are taken from the module-level
						    ``args`` (``audio_folder`` and ``dest_folder``).
						    """
						    audio_root, output_root = args.audio_folder, args.dest_folder
						    print("begin to process data...")
						    # Manifests first, vocabulary second.
						    for step in (__process_data, __get_vocab):
						        step(audio_root, output_root)
						    print("finish all!")
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						# Script entry point: run the preparation only when executed directly.
						if __name__ == "__main__":
						    main()
					
					
						
						| 
							 | 
						
 |