Spaces:

alethanhson
/

csm-1b-gradio

Sleeping

App Files Files Community

alethanhson committed on Mar 17

Commit

e0668e2

1 Parent(s): 04817a7

fix

Browse files

Files changed (6) hide show

.github/workflows/sync-to-hub.yml +20 -0
.huggingface/space.yml +11 -0
Procfile +1 -0
README.md +32 -0
app.py +194 -119
requirements.txt +2 -2

.github/workflows/sync-to-hub.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push --force https://USER:$HF_TOKEN@huggingface.co/spaces/USER/CSM-1B-GRADIO main

.huggingface/space.yml ADDED Viewed

	@@ -0,0 +1,11 @@

+title: CSM-1B Gradio Demo
+emoji: 🔊
+colorFrom: indigo
+colorTo: purple
+sdk: gradio
+sdk_version: 4.19.2
+app_file: app.py
+pinned: false
+license: apache-2.0
+models:
+  - sesame/csm-1b

Procfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ web: python app.py

README.md CHANGED Viewed

@@ -10,3 +10,35 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# CSM-1B Gradio Demo
+Ứng dụng demo cho mô hình CSM-1B (Conversational Speech Model) sử dụng Gradio để tạo giao diện người dùng thân thiện.
+## Tính năng
+- Chuyển đổi văn bản thành giọng nói tự nhiên
+- Hỗ trợ nhiều giọng đọc khác nhau (ID người nói)
+- Tạo giọng nói theo ngữ cảnh hội thoại
+- Tùy chỉnh các tham số như nhiệt độ và độ dài âm thanh
+## Sử dụng
+1. Nhập văn bản bạn muốn chuyển thành giọng nói
+2. Chọn ID người nói (từ 0-10)
+3. Tùy chỉnh các thông số nâng cao (không bắt buộc)
+4. Thêm ngữ cảnh hội thoại nếu cần
+5. Nhấn "Tạo âm thanh" để nghe kết quả
+## Triển khai trên Hugging Face Spaces
+Ứng dụng này được thiết kế để chạy trên Hugging Face Spaces. Để triển khai:
+1. Tạo một Space mới
+2. Upload mã nguồn lên Space
+3. Chọn Gradio là framework
+4. Chờ ứng dụng được xây dựng và khởi động
+## Tài nguyên
+Mô hình CSM-1B của Sesame AI: [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

app.py CHANGED Viewed

@@ -1,134 +1,209 @@
-# import base64
-# import io
-# import logging
-# from typing import List, Optional
-# import torch
-# import torchaudio
-# import uvicorn
-# from fastapi import FastAPI, HTTPException
-# from fastapi.middleware.cors import CORSMiddleware
-# from pydantic import BaseModel
-# from generator import load_csm_1b, Segment
-# import gradio as gr
-# logging.basicConfig(level=logging.INFO)
-# logger = logging.getLogger(__name__)
-# app = FastAPI(
-#     title="CSM 1B API",
-#     description="API for Sesame's Conversational Speech Model",
-#     version="1.0.0",
-# )
-# app.add_middleware(
-#     CORSMiddleware,
-#     allow_origins=["*"],
-#     allow_credentials=True,
-#     allow_methods=["*"],
-#     allow_headers=["*"],
-# )
-# generator = None
-# class SegmentRequest(BaseModel):
-#     speaker: int
-#     text: str
-#     audio_base64: Optional[str] = None
-# class GenerateAudioRequest(BaseModel):
-#     text: str
-#     speaker: int
-#     context: List[SegmentRequest] = []
-#     max_audio_length_ms: float = 10000
-#     temperature: float = 0.9
-#     topk: int = 50
-# class AudioResponse(BaseModel):
-#     audio_base64: str
-#     sample_rate: int
-# @app.on_event("startup")
-# async def startup_event():
-#     global generator
-#     logger.info("Loading CSM 1B model...")
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
-#     if device == "cpu":
-#         logger.info("Loading CSM 1B model...")
-#         logger.warning("GPU not available. Using CPU, performance may be slow!")
-#     logger.info(f"Using device: {device}")
-#     try:
-#         generator = load_csm_1b(device=device)
-#         logger.info(f"Model loaded successfully on device: {device}")
-#     except Exception as e:
-#         logger.error(f"Could not load model: {str(e)}")
-#         raise e
-# @app.post("/generate-audio", response_model=AudioResponse)
-# async def generate_audio(request: GenerateAudioRequest):
-#     global generator
-#     if generator is None:
-#         raise HTTPException(status_code=503, detail="Model not loaded. Please try again later.")
-#     try:
-#         context_segments = []
-#         for segment in request.context:
-#             if segment.audio_base64:
-#                 audio_bytes = base64.b64decode(segment.audio_base64)
-#                 audio_buffer = io.BytesIO(audio_bytes)
-#                 audio_tensor, sample_rate = torchaudio.load(audio_buffer)
-#                 audio_tensor = torchaudio.functional.resample(
-#                     audio_tensor.squeeze(0),
-#                     orig_freq=sample_rate,
-#                     new_freq=generator.sample_rate
-#                 )
-#             else:
-#                 audio_tensor = torch.zeros(0, dtype=torch.float32)
-#             context_segments.append(
-#                 Segment(text=segment.text, speaker=segment.speaker, audio=audio_tensor)
-#             )
-#         audio = generator.generate(
-#             text=request.text,
-#             speaker=request.speaker,
-#             context=context_segments,
-#             max_audio_length_ms=request.max_audio_length_ms,
-#             temperature=request.temperature,
-#             topk=request.topk,
-#         )
-#         buffer = io.BytesIO()
-#         torchaudio.save(buffer, audio.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
-#         # torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)
-#         buffer.seek(0)
-#         # audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
-#         return AudioResponse(
-#             content=buffer.read(),
-#             media_type="audio/wav",
-#             headers={"Content-Disposition": "attachment; filename=audio.wav"}
-#         )
-#     except Exception as e:
-#         logger.error(f"error when building audio: {str(e)}")
-#         raise HTTPException(status_code=500, detail=f"error when building audio: {str(e)}")
-# @app.get("/health")
-# async def health_check():
-#     if generator is None:
-#         return {"status": "not_ready", "message": "Model is loading"}
-#     return {"status": "ready", "message": "API is ready to serve"}
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import base64
+import io
+import logging
+from typing import List
+import torch
+import torchaudio
+import gradio as gr
+import numpy as np
+from generator import load_csm_1b, Segment
+# Configure the root logger to emit records at INFO level and above.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Module-wide model handle: stays None until initialize_model() stores the
+# object returned by load_csm_1b().
+generator = None
+def initialize_model():
+    global generator
+    logger.info("Đang tải mô hình CSM 1B...")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if device == "cpu":
+        logger.warning("GPU không khả dụng. Sử dụng CPU, hiệu suất có thể chậm!")
+    logger.info(f"Sử dụng thiết bị: {device}")
+    try:
+        generator = load_csm_1b(device=device)
+        logger.info(f"Mô hình đã được tải thành công trên thiết bị: {device}")
+        return True
+    except Exception as e:
+        logger.error(f"Không thể tải mô hình: {str(e)}")
+        return False
+def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
+    global generator
+    if generator is None:
+        if not initialize_model():
+            return None, "Không thể tải mô hình. Vui lòng thử lại sau."
+    try:
+        # Xử lý context nếu được cung cấp
+        context_segments = []
+        if context_texts and context_speakers:
+            for ctx_text, ctx_speaker in zip(context_texts, context_speakers):
+                if ctx_text and ctx_speaker is not None:
+                    context_segments.append(
+                        Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
+                    )
+        # Tạo âm thanh từ văn bản
+        audio = generator.generate(
+            text=text,
+            speaker=int(speaker_id),
+            context=context_segments,
+            max_audio_length_ms=float(max_audio_length_ms),
+            temperature=float(temperature),
+            topk=int(topk),
+        )
+        # Chuyển đổi tensor thành numpy array để Gradio có thể xử lý
+        audio_numpy = audio.cpu().numpy()
+        sample_rate = generator.sample_rate
+        return (sample_rate, audio_numpy), None
+    except Exception as e:
+        logger.error(f"Lỗi khi tạo âm thanh: {str(e)}")
+        return None, f"Lỗi khi tạo âm thanh: {str(e)}"
+def clear_context():
+    return [], []
+def add_context(text, speaker_id, context_texts, context_speakers):
+    if text and speaker_id is not None:
+        context_texts.append(text)
+        context_speakers.append(int(speaker_id))
+    return context_texts, context_speakers
+# Thiết lập giao diện Gradio
+with gr.Blocks(title="CSM 1B Demo") as demo:
+    gr.Markdown("# CSM 1B - Mô hình tạo giọng nói hội thoại")
+    gr.Markdown("Nhập văn bản để tạo giọng nói tự nhiên với mô hình CSM 1B")
+    with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(
+                label="Văn bản để chuyển thành giọng nói",
+                placeholder="Nhập văn bản ở đây...",
+                lines=3
+            )
+            speaker_id = gr.Slider(
+                label="ID người nói",
+                minimum=0,
+                maximum=10,
+                step=1,
+                value=0
+            )
+            with gr.Accordion("Tùy chọn nâng cao", open=False):
+                max_length = gr.Slider(
+                    label="Độ dài tối đa (mili giây)",
+                    minimum=1000,
+                    maximum=30000,
+                    step=1000,
+                    value=10000
+                )
+                temp = gr.Slider(
+                    label="Nhiệt độ",
+                    minimum=0.1,
+                    maximum=1.5,
+                    step=0.1,
+                    value=0.9
+                )
+                top_k = gr.Slider(
+                    label="Top K",
+                    minimum=10,
+                    maximum=100,
+                    step=10,
+                    value=50
+                )
+            with gr.Accordion("Ngữ cảnh hội thoại", open=False):
+                context_list = gr.State([])
+                context_speakers_list = gr.State([])
+                with gr.Row():
+                    context_text = gr.Textbox(label="Văn bản ngữ cảnh", lines=2)
+                    context_speaker = gr.Slider(
+                        label="ID người nói ngữ cảnh",
+                        minimum=0,
+                        maximum=10,
+                        step=1,
+                        value=0
+                    )
+                with gr.Row():
+                    add_ctx_btn = gr.Button("Thêm ngữ cảnh")
+                    clear_ctx_btn = gr.Button("Xóa tất cả ngữ cảnh")
+                context_display = gr.Dataframe(
+                    headers=["Văn bản", "ID người nói"],
+                    label="Ngữ cảnh hiện tại",
+                    interactive=False
+                )
+            generate_btn = gr.Button("Tạo âm thanh", variant="primary")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Âm thanh được tạo", type="numpy")
+            error_output = gr.Textbox(label="Thông báo lỗi", visible=False)
+    # Kết nối các sự kiện
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[
+            text_input,
+            speaker_id,
+            max_length,
+            temp,
+            top_k,
+            context_list,
+            context_speakers_list
+        ],
+        outputs=[audio_output, error_output]
+    )
+    add_ctx_btn.click(
+        fn=add_context,
+        inputs=[
+            context_text,
+            context_speaker,
+            context_list,
+            context_speakers_list
+        ],
+        outputs=[context_list, context_speakers_list]
+    )
+    clear_ctx_btn.click(
+        fn=clear_context,
+        inputs=[],
+        outputs=[context_list, context_speakers_list]
+    )
+    # Cập nhật hiển thị ngữ cảnh
+    def update_context_display(texts, speakers):
+        if not texts or not speakers:
+            return []
+        return [[text, speaker] for text, speaker in zip(texts, speakers)]
+    context_list.change(
+        fn=update_context_display,
+        inputs=[context_list, context_speakers_list],
+        outputs=[context_display]
+    )
+    context_speakers_list.change(
+        fn=update_context_display,
+        inputs=[context_list, context_speakers_list],
+        outputs=[context_display]
+    )
+# Khởi động ứng dụng khi tải trang
+initialize_model()
+# Cấu hình cho Hugging Face Spaces
+demo.launch(share=False)

requirements.txt CHANGED Viewed

@@ -7,7 +7,7 @@ moshi==0.2.2
 torchtune==0.4.0
 torchao==0.9.0
 silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master
-fastapi
-uvicorn[standard]
 python-multipart==0.0.9
 pydantic==2.6.1

 torchtune==0.4.0
 torchao==0.9.0
 silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master
+gradio==4.19.2
+numpy>=1.22.0
 python-multipart==0.0.9
 pydantic==2.6.1