Real-Time-Voice-Cloning

Runtime error

App Files Files Community

Ahsen Khaliq committed on Sep 30, 2021

Commit

c8546f3

1 Parent(s): 24829a1

Update demo_cli.py

Browse files

Files changed (1) hide show

demo_cli.py +86 -85

demo_cli.py CHANGED Viewed

@@ -36,6 +36,10 @@ if __name__ == '__main__':
         "Optional random number seed value to make toolbox deterministic.")
     parser.add_argument("--no_mp3_support", action="store_true", help=\
         "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
@@ -132,94 +136,91 @@ if __name__ == '__main__':
           "an explanation of what is happening.\n")
     print("Interactive generation loop")
-    num_generated = 0
-    while True:
-        try:
-            # Get the reference audio filepath
-            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
-                      "wav, m4a, flac, ...):\n"
-            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
-            if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
-                print("Can't Use mp3 files please try again:")
-                continue
-            ## Computing the embedding
-            # First, we load the wav using the function that the speaker encoder provides. This is
-            # important: there is preprocessing that must be applied.
-            # The following two methods are equivalent:
-            # - Directly load from the filepath:
-            preprocessed_wav = encoder.preprocess_wav(in_fpath)
-            # - If the wav is already loaded:
-            original_wav, sampling_rate = librosa.load(str(in_fpath))
-            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-            print("Loaded file succesfully")
-            # Then we derive the embedding. There are many functions and parameters that the
-            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-            # only use this function (with its default parameters):
-            embed = encoder.embed_utterance(preprocessed_wav)
-            print("Created the embedding")
-            ## Generating the spectrogram
-            text = input("Write a sentence (+-20 words) to be synthesized:\n")
-            # If seed is specified, reset torch seed and force synthesizer reload
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                synthesizer = Synthesizer(args.syn_model_fpath)
-            # The synthesizer works in batch, so you need to put your data in a list or numpy array
-            texts = [text]
-            embeds = [embed]
-            # If you know what the attention layer alignments are, you can retrieve them here by
-            # passing return_alignments=True
-            specs = synthesizer.synthesize_spectrograms(texts, embeds)
-            spec = specs[0]
-            print("Created the mel spectrogram")
-            ## Generating the waveform
-            print("Synthesizing the waveform:")
-            # If seed is specified, reset torch seed and reload vocoder
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                vocoder.load_model(args.voc_model_fpath)
-            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-            # spectrogram, the more time-efficient the vocoder.
-            generated_wav = vocoder.infer_waveform(spec)
-            ## Post-generation
-            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-            # pad it.
-            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
-            # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-            generated_wav = encoder.preprocess_wav(generated_wav)
-            # Play the audio (non-blocking)
-            if not args.no_sound:
-                try:
-                    sd.stop()
-                    sd.play(generated_wav, synthesizer.sample_rate)
-                except sd.PortAudioError as e:
-                    print("\nCaught exception: %s" % repr(e))
-                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-                except:
-                    raise
-            # Save it on the disk
-            filename = "demo_output_%02d.wav" % num_generated
-            print(generated_wav.dtype)
-            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
-            num_generated += 1
-            print("\nSaved output as %s\n\n" % filename)
-        except Exception as e:
-            print("Caught exception: %s" % repr(e))
-            print("Restarting\n")

         "Optional random number seed value to make toolbox deterministic.")
     parser.add_argument("--no_mp3_support", action="store_true", help=\
         "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
+    parser.add_argument("-audio", "--audio_path", type=Path, required = True,
+                        help="Path to a audio file")
+    parser.add_argument("--text", type=str, required = True, help=\
+        "Text Input")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
           "an explanation of what is happening.\n")
     print("Interactive generation loop")
+    # while True:
+    try:
+        # Get the reference audio filepath
+        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
+                    "wav, m4a, flac, ...):\n"
+        in_fpath = args.audio_path
+        if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
+            print("Can't Use mp3 files please try again:")
+        ## Computing the embedding
+        # First, we load the wav using the function that the speaker encoder provides. This is
+        # important: there is preprocessing that must be applied.
+        # The following two methods are equivalent:
+        # - Directly load from the filepath:
+        preprocessed_wav = encoder.preprocess_wav(in_fpath)
+        # - If the wav is already loaded:
+        original_wav, sampling_rate = librosa.load(str(in_fpath))
+        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+        print("Loaded file succesfully")
+        # Then we derive the embedding. There are many functions and parameters that the
+        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+        # only use this function (with its default parameters):
+        embed = encoder.embed_utterance(preprocessed_wav)
+        print("Created the embedding")
+        ## Generating the spectrogram
+        text = args.text
+        # If seed is specified, reset torch seed and force synthesizer reload
+        if args.seed is not None:
+            torch.manual_seed(args.seed)
+            synthesizer = Synthesizer(args.syn_model_fpath)
+        # The synthesizer works in batch, so you need to put your data in a list or numpy array
+        texts = [text]
+        embeds = [embed]
+        # If you know what the attention layer alignments are, you can retrieve them here by
+        # passing return_alignments=True
+        specs = synthesizer.synthesize_spectrograms(texts, embeds)
+        spec = specs[0]
+        print("Created the mel spectrogram")
+        ## Generating the waveform
+        print("Synthesizing the waveform:")
+        # If seed is specified, reset torch seed and reload vocoder
+        if args.seed is not None:
+            torch.manual_seed(args.seed)
+            vocoder.load_model(args.voc_model_fpath)
+        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+        # spectrogram, the more time-efficient the vocoder.
+        generated_wav = vocoder.infer_waveform(spec)
+        ## Post-generation
+        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+        # pad it.
+        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
+        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+        generated_wav = encoder.preprocess_wav(generated_wav)
+        # Play the audio (non-blocking)
+        if not args.no_sound:
+            try:
+                sd.stop()
+                sd.play(generated_wav, synthesizer.sample_rate)
+            except sd.PortAudioError as e:
+                print("\nCaught exception: %s" % repr(e))
+                print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
+            except:
+                raise
+        # Save it on the disk
+        filename = "demo_output_1.wav"
+        print(generated_wav.dtype)
+        sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
+        print("\nSaved output as %s\n\n" % filename)
+    except Exception as e:
+        print("Caught exception: %s" % repr(e))
+        print("Restarting\n")