| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						import argparse | 
					
					
						
						| 
							 | 
						import glob | 
					
					
						
						| 
							 | 
						import json | 
					
					
						
						| 
							 | 
						import logging | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import subprocess | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import librosa | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						parser = argparse.ArgumentParser(description="AN4 dataset download and processing") | 
					
					
						
						| 
							 | 
						parser.add_argument("--data_root", required=True, default=None, type=str) | 
					
					
						
						| 
							 | 
						args = parser.parse_args() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def build_manifest(data_root, transcripts_path, manifest_path, wav_path): | 
					
					
						
						| 
							 | 
						    with open(transcripts_path, 'r') as fin: | 
					
					
						
						| 
							 | 
						        with open(manifest_path, 'w') as fout: | 
					
					
						
						| 
							 | 
						            for line in fin: | 
					
					
						
						| 
							 | 
						                 | 
					
					
						
						| 
							 | 
						                 | 
					
					
						
						| 
							 | 
						                transcript = line[: line.find('(') - 1].lower() | 
					
					
						
						| 
							 | 
						                transcript = transcript.replace('<s>', '').replace('</s>', '') | 
					
					
						
						| 
							 | 
						                transcript = transcript.strip() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                file_id = line[line.find('(') + 1 : -2]   | 
					
					
						
						| 
							 | 
						                audio_path = os.path.join( | 
					
					
						
						| 
							 | 
						                    data_root, wav_path, file_id[file_id.find('-') + 1 : file_id.rfind('-')], file_id + '.wav', | 
					
					
						
						| 
							 | 
						                ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                duration = librosa.core.get_duration(filename=audio_path) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                 | 
					
					
						
						| 
							 | 
						                metadata = { | 
					
					
						
						| 
							 | 
						                    "audio_filepath": audio_path, | 
					
					
						
						| 
							 | 
						                    "duration": duration, | 
					
					
						
						| 
							 | 
						                    "text": transcript, | 
					
					
						
						| 
							 | 
						                } | 
					
					
						
						| 
							 | 
						                json.dump(metadata, fout) | 
					
					
						
						| 
							 | 
						                fout.write('\n') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def main(): | 
					
					
						
						| 
							 | 
						    data_root = os.path.abspath(args.data_root) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    logging.info("Converting audio files to .wav...") | 
					
					
						
						| 
							 | 
						    sph_list = glob.glob(os.path.join(data_root, 'an4/**/*.sph'), recursive=True) | 
					
					
						
						| 
							 | 
						    for sph_path in sph_list: | 
					
					
						
						| 
							 | 
						        wav_path = sph_path[:-4] + '.wav' | 
					
					
						
						| 
							 | 
						        cmd = ['sox', sph_path, wav_path] | 
					
					
						
						| 
							 | 
						        subprocess.run(cmd) | 
					
					
						
						| 
							 | 
						    logging.info("Finished conversion.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    logging.info("Building training manifest...") | 
					
					
						
						| 
							 | 
						    train_transcripts = os.path.join(data_root, 'an4/etc/an4_train.transcription') | 
					
					
						
						| 
							 | 
						    train_manifest = os.path.join(data_root, 'an4/train_manifest.json') | 
					
					
						
						| 
							 | 
						    train_wavs = os.path.join(data_root, 'an4/wav/an4_clstk') | 
					
					
						
						| 
							 | 
						    build_manifest(data_root, train_transcripts, train_manifest, train_wavs) | 
					
					
						
						| 
							 | 
						    logging.info("Training manifests created.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    logging.info("Building test manifest...") | 
					
					
						
						| 
							 | 
						    test_transcripts = os.path.join(data_root, 'an4/etc/an4_test.transcription') | 
					
					
						
						| 
							 | 
						    test_manifest = os.path.join(data_root, 'an4/test_manifest.json') | 
					
					
						
						| 
							 | 
						    test_wavs = os.path.join(data_root, 'an4/wav/an4test_clstk') | 
					
					
						
						| 
							 | 
						    build_manifest(data_root, test_transcripts, test_manifest, test_wavs) | 
					
					
						
						| 
							 | 
						    logging.info("Test manifest created.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    logging.info("Done with AN4 processing!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						if __name__ == '__main__': | 
					
					
						
						| 
							 | 
						    main() | 
					
					
						
						| 
							 | 
						
 |