---
license: mit
language:
- en
tags:
- audio
- text-to-speech
- matcha-tts
---
# Matcha-TTS CommonVoice EN001
[You can test the model variations](https://huggingface.co/spaces/Akjava/matcha-tts-onnx-benchmarks) | [GitHub Demo](https://akjava.github.io/Matcha-TTS-Japanese/matcha_tts_speak_en001.html)
## Source Audio
https://commonvoice.mozilla.org/en/datasets
Common Voice Corpus 1
I call the set of 290 audio files whose head audio ID is 42da7f26 "EN001".
(There is no plan to include the audio files in this repo.)
## Any Good Points?
LJSpeech has much better quality, but it is a female voice; this one is male.
The 109 VCTK voices are of similar quality, but VCTK is under the ODC-By license.
This audio is under MIT, which makes it easier to continue training or build on it.
However, I still recommend VCTK; the ODC-By license is not really a problem. I'm going to create new voices with this data in the future.
## How to Train
Train with IPA text using this fork:
https://github.com/akjava/Matcha-TTS-Japanese
Check that repo's config files.
However, there are no audio-copying tools yet (TODO).
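The config files expect IPA text. As a minimal sketch (an assumption on my part; this fork's actual text frontend may differ), English text can be converted to IPA with the phonemizer package and its espeak backend:
```
from phonemizer import phonemize

# Convert plain English text to IPA with stress marks (espeak backend).
text = "One two three."
ipa = phonemize(
    text,
    language="en-us",
    backend="espeak",
    strip=True,
    preserve_punctuation=True,
    with_stress=True,
)
print(ipa)
```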
## Files Info
### Checkpoints
Matcha-TTS checkpoints. The epoch numbers look large, but training used only 290 audio files.
Sadly, I lost the checkpoints between 3599 and 4499. I'm sorry.
Judging from the training metrics, epoch 6399 seems to overfit, but my English listening skill is poor and I can't really evaluate it.
### ONNX
[GitHub code](https://github.com/akjava/Matcha-TTS-Japanese/tree/main/examples) - see the source code
[GitHub Pages](https://akjava.github.io/Matcha-TTS-Japanese/) - test the ONNX example
The simplified ONNX model now loads about 1.5x faster:
```
import onnx
from onnxsim import simplify

# Load the exported model, simplify the graph, and save the result.
model = onnx.load("en001_6399_T2.onnx")
model_simp, check = simplify(model)  # check is True if the simplified model passed validation
onnx.save(model_simp, "en001_6399_T2_simplify.onnx")
```
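If you want to verify the speed-up yourself, a rough sketch (file names taken from the snippet above) is to time session creation with onnxruntime:
```
import time
import onnxruntime

def load_seconds(path):
    start = time.perf_counter()
    onnxruntime.InferenceSession(path, providers=["CPUExecutionProvider"])
    return time.perf_counter() - start

print("original  :", load_seconds("en001_6399_T2.onnx"))
print("simplified:", load_seconds("en001_6399_T2_simplify.onnx"))
```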
The number of timesteps is the default (5); with fewer timesteps, inference is somewhat faster but quality is lower.
If you need the original ONNX files, export them the official way:
```
python -m matcha.onnx.export checkpoint_epoch=5699.ckpt en001_5699t2.onnx --vocoder-name hifigan_T2_v1 --n-timesteps 5 --vocoder-checkpoint generator_v1
python -m matcha.onnx.export checkpoint_epoch=5699.ckpt en001_5699.onnx --vocoder-name hifigan_univ_v1 --n-timesteps 5 --vocoder-checkpoint g_02500000
```
- T2 means the vocoder is hifigan_T2_v1
- Univ means the vocoder is hifigan_univ_v1
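For reference, here is a minimal Python sketch of running one of the exported models with onnxruntime. The input names (`x`, `x_lengths`, `scales`) and the `wav` / `wav_lengths` outputs are the ones used by the JavaScript sample below; the token IDs here are placeholders and in practice come from mapping IPA symbols to indices, as that sample does.
```
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("en001_5699t2.onnx", providers=["CPUExecutionProvider"])

# Placeholder token IDs; real IDs come from the IPA symbol table
# (interspersed with 0), as in the JavaScript sample below.
x = np.array([[0, 52, 0, 30, 0, 61, 0]], dtype=np.int64)

wav, wav_lengths = session.run(
    ["wav", "wav_lengths"],
    {
        "x": x,
        "x_lengths": np.array([x.shape[1]], dtype=np.int64),
        "scales": np.array([0.667, 1.0], dtype=np.float32),  # temperature, speed
    },
)
print(wav.shape, wav_lengths)  # float32 waveform at 22050 Hz
```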
You can also quantize the ONNX model; it becomes about 3 times smaller but 4-5 times slower, which is why I didn't include a quantized version.
```
from onnxruntime.quantization import quantize_dynamic, QuantType

# src_model_path / dst_model_path are your input and output file paths
quantize_dynamic(src_model_path, dst_model_path, weight_type=QuantType.QUInt8)
```
To use the ONNX model in a browser you need some glue code; below is an old JavaScript sample (it assumes onnxruntime-web is loaded as the global `ort`):
```
const _pad = "_";
const _punctuation = ";:,.!?¡¿—…\"«»“” ";
const _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
const _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";
// build the symbol table with spread syntax (each string is flattened into individual characters)
const Symbols = [_pad, ..._punctuation, ..._letters, ..._letters_ipa];
const SpaceId = Symbols.indexOf(' ');
const symbolToId = {};
const idToSymbol = {};
// initialize symbolToId and idToSymbol
for (let i = 0; i < Symbols.length; i++) {
  symbolToId[Symbols[i]] = i;
  idToSymbol[i] = Symbols[i];
}
class MatchaOnnx {
  constructor() {
  }
  async load_model(model_path, options = {}) {
    this.session = await ort.InferenceSession.create(model_path, options);
  }
  get_output_names_html() {
    if (typeof this.session == 'undefined') {
      return null
    }
    let outputNamesString = '[outputs]<br>';
    const outputNames = this.session.outputNames;
    for (let outputName of outputNames) {
      console.log(outputName)
      outputNamesString += outputName + "<br>"
    }
    return outputNamesString.trim()
  }
  get_input_names_html() {
    if (typeof this.session == 'undefined') {
      return null
    }
    let inputNamesString = '[Inputs]<br>';
    const inputNames = this.session.inputNames;
    for (let inputName of inputNames) {
      console.log(inputName)
      inputNamesString += inputName + "<br>"
    }
    return inputNamesString.trim()
  }
  processText(text) {
    const x = this.intersperse(this.textToSequence(text));
    const x_phones = this.sequenceToText(x);
    const textList = [];
    for (let i = 1; i < x_phones.length; i += 2) {
      textList.push(x_phones[i]);
    }
    return {
      x: x,
      x_length: x.length,
      x_phones: x_phones,
      x_phones_label: textList.join(""),
    };
  }
  basicCleaners2(text, lowercase = false) {
    if (lowercase) {
      text = text.toLowerCase();
    }
    text = text.replace(/\s+/g, " ");
    return text;
  }
  textToSequence(text) {
    const sequenceList = [];
    const clean_text = this.basicCleaners2(text);
    for (let i = 0; i < clean_text.length; i++) {
      const symbol = clean_text[i];
      sequenceList.push(symbolToId[symbol]);
    }
    return sequenceList;
  }
  intersperse(sequence, item = 0) {
    const sequenceList = [item];
    for (let i = 0; i < sequence.length; i++) {
      sequenceList.push(sequence[i]);
      sequenceList.push(item);
    }
    return sequenceList;
  }
  sequenceToText(sequence) {
    const textList = [];
    for (let i = 0; i < sequence.length; i++) {
      const symbol = idToSymbol[sequence[i]];
      textList.push(symbol);
    }
    return textList.join("");
  }
  async infer(text, temperature, speed) {
    console.log(this.session)
    const dic = this.processText(text);
    console.log(`x:${dic.x.join(", ")}`);
    console.log(`x_length:${dic.x_length}`);
    console.log(`x_phones_label:${dic.x_phones_label}`);
    // Prepare input tensors (assuming your ONNX Runtime library uses similar syntax)
    //const x_tensor = new this.session.Tensor('long', dic.x, [1, dic.x.length]);
    //const x_length_tensor = new this.session.Tensor('long', [dic.x.length], [1]);
    //const scales_tensor = new this.session.Tensor('float', [temperature, speed], [2]);
    const dataX = new BigInt64Array(dic.x.length)
    for (let i = 0; i < dic.x.length; i++) {
      //console.log(dic.x[i])
      dataX[i] = BigInt(dic.x[i]); // Convert each number to a BigInt
    }
    const data_x_length = new BigInt64Array(1)
    data_x_length[0] = BigInt(dic.x_length)
    //const dataX = Int32Array.from([dic.x_length])
    const tensorX = new ort.Tensor('int64', dataX, [1, dic.x.length]);
    // const data_x_length = Int32Array.from([dic.x_length])
    const tensor_x_length = new ort.Tensor('int64', data_x_length, [1]);
    const data_scale = Float32Array.from([temperature, speed])
    const tensor_scale = new ort.Tensor('float32', data_scale, [2]);
    // Run inference
    const output = await this.session.run({
      x: tensorX,
      x_lengths: tensor_x_length,
      scales: tensor_scale,
    });
    console.log(output)
    // Extract output (assuming your ONNX Runtime library uses similar syntax)
    const wav_array = output.wav.data;
    console.log(wav_array[0]);
    console.log(wav_array.length);
    const x_lengths_array = output.wav_lengths.data;
    console.log(x_lengths_array.join(", "));
    return wav_array;
  }
}
```
Convert the Float32Array output to WAV and play it:
```
function webWavPlay(f32array) {
  const blob = float32ArrayToWav(f32array)
  const url = createObjectUrlFromBlob(blob)
  console.log(url)
  playAudioFromUrl(url)
}
function createObjectUrlFromBlob(blob) {
  const url = URL.createObjectURL(blob);
  return url;
}
function playAudioFromUrl(url) {
  const audio = new Audio(url);
  audio.play().catch(error => console.error('Failed to play audio:', error));
}
// copied from
// https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de/blob/main/app-tts.js
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
function float32ArrayToWav(floatSamples, sampleRate = 22050) {
  let samples = new Int16Array(floatSamples.length);
  for (let i = 0; i < samples.length; ++i) {
    let s = floatSamples[i];
    if (s >= 1)
      s = 1;
    else if (s <= -1)
      s = -1;
    samples[i] = s * 32767;
  }
  let buf = new ArrayBuffer(44 + samples.length * 2);
  var view = new DataView(buf);
  // http://soundfile.sapp.org/doc/WaveFormat/
  view.setUint32(0, 0x46464952, true); // chunkID "RIFF"
  view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
  view.setUint32(8, 0x45564157, true); // format "WAVE"
  view.setUint32(12, 0x20746d66, true); // subchunk1ID "fmt "
  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
  view.setUint32(20, 1, true); // audioFormat, 1 for PCM
  view.setUint16(22, 1, true); // numChannels: 1 channel
  view.setUint32(24, sampleRate, true); // sampleRate
  view.setUint32(28, sampleRate * 2, true); // byteRate
  view.setUint16(32, 2, true); // blockAlign
  view.setUint16(34, 16, true); // bitsPerSample
  view.setUint32(36, 0x61746164, true); // subchunk2ID "data"
  view.setUint32(40, samples.length * 2, true); // subchunk2Size
  let offset = 44;
  for (let i = 0; i < samples.length; ++i) {
    view.setInt16(offset, samples[i], true);
    offset += 2;
  }
  return new Blob([view], {type: 'audio/wav'});
}
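// Usage sketch (assumptions: this runs in a browser page that loads onnxruntime-web as the
// global `ort`, the model path is a placeholder, and the text passed to infer() is already
// IPA symbols from the table above, since the model was trained on IPA text).
async function demo() {
  const matcha = new MatchaOnnx();
  await matcha.load_model("en001_6399_T2_simplify.onnx");
  const wav = await matcha.infer("wˈʌn tˈuː θɹˈiː.", 0.667, 1.0); // temperature, speed
  webWavPlay(wav);
}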
```
### Audio
I cut the audio with VAD tools and denoised it with resemble-enhance.
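For illustration only (the exact tools and settings are not documented here), a VAD pass could look like this with silero-vad:
```
import torch

# silero-vad is just one example of a VAD tool, not necessarily the one used here.
model, utils = torch.hub.load("snakers4/silero-vad", "silero_vad")
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

wav = read_audio("clip.wav", sampling_rate=16000)  # hypothetical input file
timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(timestamps)  # speech segments as {'start': ..., 'end': ...} in samples
```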