File size: 7,444 Bytes

8bdc19f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a84125d0-ae92-4829-976d-456491ac7c0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] =  \"0\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3179777-b19d-46fb-8eb5-c5a4dd175f68",
   "metadata": {},
   "outputs": [],
   "source": [
    "from vinorm import TTSnorm\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9427640-6c1e-4337-9230-30838b508ea8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "189956af-477b-4756-88e6-e3002a8ed039",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from f5tts_wrapper import F5TTSWrapper\n",
    "\n",
    "# Initialize the F5-TTS wrapper\n",
    "tts = F5TTSWrapper(\n",
    "    vocoder_name=\"vocos\",\n",
    "    # point to  EraX model checkpoint downloaded from HuggingFace\n",
    "    ckpt_path=\"path to model_last.pt\",\n",
    "    vocab_file=\"vocab.txt\",\n",
    "    use_ema=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ccf4419-a9b7-4dcb-9ce0-5ab3b948de61",
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_audio_path = \"update_213000_ref.wav\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "240415b2-5e32-4caa-b833-ceae50675ca3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import IPython\n",
    "IPython.display.Audio(ref_audio_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bde9c39-d8fa-454a-9fcc-418175d8a989",
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_text =  \"Thậm chí không ăn thì cũng có cảm giác rất là cứng bụng, chủ yếu là cái phần rốn...trở lên. Em có cảm giác khó thở, và ngủ cũng không ngon, thường bị ợ hơi rất là nhiều\"\n",
    "ref_text = TTSnorm(ref_text)\n",
    "ref_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1587e309-d118-40f1-91ee-a949a28eb135",
   "metadata": {},
   "outputs": [],
   "source": [
    "text =  \"Trong khi đó, tại một chung cư trên địa bàn P.Vĩnh Tuy (Q.Hoàng Mai), nhiều người sống trên tầng cao giật mình khi thấy rung lắc mạnh nên đã chạy xuống sảnh tầng 1. Cư dân tại đây cho biết, họ chưa bao giờ cảm thấy ảnh hưởng của động đất mạnh như hôm nay.\"\n",
    "text_norm =  TTSnorm(text)\n",
    "text_norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "918bf198-1ec7-4236-87bd-3be6210ea011",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to reference audio and output directory\n",
    "output_dir = \"output\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Preprocess the reference audio just once\n",
    "# You can provide a reference text, or leave it blank for auto-transcription\n",
    "tts.preprocess_reference(\n",
    "    ref_audio_path=ref_audio_path,\n",
    "    ref_text=ref_text,  # Leave empty for auto-transcription\n",
    "    clip_short=True  # Clip long audio to ~12 seconds max\n",
    ")\n",
    "\n",
    "print(f\"Reference audio duration: {tts.get_current_audio_length():.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23aa55ad-612f-42f9-9742-ce0ce50a054a",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences =  [\n",
    "    gen_text\n",
    "]\n",
    "for i, sentence in enumerate(sentences):\n",
    "    output_path = os.path.join(output_dir, f\"generated_non_ema_{i+1}.wav\")\n",
    "\n",
    "    sentence_normed  = TTSnorm(sentence)\n",
    "    \n",
    "    # Generate the audio\n",
    "    tts.generate(\n",
    "        text=sentence_normed,\n",
    "        output_path=output_path,\n",
    "        nfe_step=20,               # Number of denoising steps (32)\n",
    "        cfg_strength=2.0,          # Classifier-free guidance strength (2)\n",
    "        speed=1.0,                 # Speed of generated speech (1.0)\n",
    "        cross_fade_duration=0.15,  # Cross-fade between chunks (0.15)\n",
    "    )\n",
    "    \n",
    "    print(f\"Generated: {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a3cb66e-1343-43cd-bed7-84254a74262f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import IPython\n",
    "IPython.display.Audio(f\"{output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "832f3426-7a2a-4be9-aaaf-630866ecb31b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# For applications needing the raw audio data rather than saved files:\n",
    "audio_array, sample_rate = tts.generate(\n",
    "    text=gen_text,\n",
    "    return_numpy=True,\n",
    "    nfe_step=20,               # Number of denoising steps (32)\n",
    "    cfg_strength=2.0,          # Classifier-free guidance strength (2)\n",
    "    speed=1.0,                 # Speed of generated speech (1.0)\n",
    "    cross_fade_duration=0.15,  # Cross-fade between chunks (0.15)\n",
    ")\n",
    "print(f\"Generated audio array with shape {audio_array.shape} and sample rate {sample_rate}\")\n",
    "\n",
    "# Get audio and spectrogram (useful for visualization)\n",
    "audio_array, sample_rate, spectrogram = tts.generate(\n",
    "    text=\"This returns both audio and its spectrogram.\",\n",
    "    return_numpy=True,\n",
    "    return_spectrogram=True,\n",
    "    nfe_step=20,               # Number of denoising steps (32)\n",
    "    cfg_strength=2.0,          # Classifier-free guidance strength (2)\n",
    "    speed=1.0,                 # Speed of generated speech (1.0)\n",
    "    cross_fade_duration=0.15,  # Cross-fade between chunks (0.15)\n",
    ")\n",
    "print(f\"Generated spectrogram with shape {spectrogram.shape}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "862de193-96b3-4a37-9326-697838168437",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def save_spectrogram(spectrogram):\n",
    "    plt.figure(figsize=(12, 4))\n",
    "    plt.imshow(spectrogram, origin=\"lower\", aspect=\"auto\")\n",
    "    plt.colorbar()\n",
    "    plt.show()\n",
    "    plt.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb1979e7-5645-4cb0-94d8-aacc59109b03",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_spectrogram(spectrogram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87824cb4-7721-4d46-948a-a3457869b679",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}