hysts (HF Staff) committed
Commit 01e4a7b
Parent(s): e0e3138
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.whl filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.12.0
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.16.1
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Gemma 3n E4B It
- emoji: 🦀
- colorFrom: purple
- colorTo: pink
+ emoji:
+ colorFrom: red
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.34.2
  app_file: app.py
app.py ADDED
@@ -0,0 +1,260 @@
+ import os
+ import pathlib
+ import shlex
+ import subprocess
+ import tempfile
+ from collections.abc import Iterator
+ from threading import Thread
+
+ # TODO: remove this once the transformers implementation is published  # noqa: FIX002, TD002, TD003
+ if os.getenv("SPACE_ID"):
+     subprocess.run(shlex.split("pip install wheels/timm-1.0.16.dev0-py3-none-any.whl"), check=True)  # noqa: S603
+     subprocess.run(shlex.split("pip install wheels/transformers-4.53.0.dev0-py3-none-any.whl"), check=True)  # noqa: S603
+
+ import av
+ import gradio as gr
+ import spaces
+ import torch
+ from gradio.utils import get_upload_folder
+ from transformers import AutoModelForImageTextToText, AutoProcessor
+ from transformers.generation.streamers import TextIteratorStreamer
+
+ # TODO: update model_id  # noqa: FIX002, TD002, TD003
+ model_id = "gg-hf-gm/gemma-3n-E4B-it"
+
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+
+ IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
+ VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
+ AUDIO_FILE_TYPES = (".mp3", ".wav")
+
+ GRADIO_TEMP_DIR = get_upload_folder()
+
+ TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))
+ MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))
+ MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))
+
+
+ def get_file_type(path: str) -> str:
+     if path.endswith(IMAGE_FILE_TYPES):
+         return "image"
+     if path.endswith(VIDEO_FILE_TYPES):
+         return "video"
+     if path.endswith(AUDIO_FILE_TYPES):
+         return "audio"
+     error_message = f"Unsupported file type: {path}"
+     raise ValueError(error_message)
+
+
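Aside: `str.endswith` accepts a tuple of suffixes, so the constant tuples above double as the routing table here and, concatenated near the bottom of the file, as the textbox's `file_types` allowlist.

    "assets/cat.jpeg".endswith(IMAGE_FILE_TYPES)  # True -> get_file_type returns "image"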
+ def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
+     video_count = 0
+     non_video_count = 0
+     for path in paths:
+         if path.endswith(VIDEO_FILE_TYPES):
+             video_count += 1
+         else:
+             non_video_count += 1
+     return video_count, non_video_count
+
+
+ def validate_media_constraints(message: dict) -> bool:
+     video_count, non_video_count = count_files_in_new_message(message["files"])
+     if video_count > 1:
+         gr.Warning("Only one video is supported.")
+         return False
+     if video_count == 1 and non_video_count > 0:
+         gr.Warning("Mixing images and videos is not allowed.")
+         return False
+     return True
+
+
+ def extract_frames_to_tempdir(
+     video_path: str,
+     target_fps: float,
+     max_frames: int | None = None,
+     parent_dir: str | None = None,
+     prefix: str = "frames_",
+ ) -> str:
+     temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
+
+     container = av.open(video_path)
+     video_stream = container.streams.video[0]
+
+     if video_stream.duration is None or video_stream.time_base is None:
+         raise ValueError("video_stream is missing duration or time_base")
+
+     time_base = video_stream.time_base
+     duration = float(video_stream.duration * time_base)
+     interval = 1.0 / target_fps
+
+     total_frames = int(duration * target_fps)
+     if max_frames is not None:
+         total_frames = min(total_frames, max_frames)
+
+     target_times = [i * interval for i in range(total_frames)]
+     target_index = 0
+
+     for frame in container.decode(video=0):
+         if frame.pts is None:
+             continue
+
+         timestamp = float(frame.pts * time_base)
+
+         if target_index < len(target_times) and abs(timestamp - target_times[target_index]) < (interval / 2):
+             frame_path = pathlib.Path(temp_dir) / f"frame_{target_index:04d}.jpg"
+             frame.to_image().save(frame_path)
+             target_index += 1
+
+         if max_frames is not None and target_index >= max_frames:
+             break
+
+     container.close()
+     return temp_dir
+
+
+ def process_new_user_message(message: dict) -> list[dict]:
117
+ if not message["files"]:
118
+ return [{"type": "text", "text": message["text"]}]
119
+
120
+ file_types = [get_file_type(path) for path in message["files"]]
121
+
122
+ if len(file_types) == 1 and file_types[0] == "video":
123
+ gr.Info(f"Video will be processed at {TARGET_FPS} FPS, max {MAX_FRAMES} frames in this Space.")
124
+
125
+ temp_dir = extract_frames_to_tempdir(
126
+ message["files"][0],
127
+ target_fps=TARGET_FPS,
128
+ max_frames=MAX_FRAMES,
129
+ parent_dir=GRADIO_TEMP_DIR,
130
+ )
131
+ paths = sorted(pathlib.Path(temp_dir).glob("*.jpg"))
132
+ return [
133
+ {"type": "text", "text": message["text"]},
134
+ *[{"type": "image", "image": path.as_posix()} for path in paths],
135
+ ]
136
+
137
+ return [
138
+ {"type": "text", "text": message["text"]},
139
+ *[{"type": file_type, file_type: path} for path, file_type in zip(message["files"], file_types, strict=True)],
140
+ ]
141
+
142
+
143
+ def process_history(history: list[dict]) -> list[dict]:
144
+ messages = []
145
+ current_user_content: list[dict] = []
146
+ for item in history:
147
+ if item["role"] == "assistant":
148
+ if current_user_content:
149
+ messages.append({"role": "user", "content": current_user_content})
150
+ current_user_content = []
151
+ messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
152
+ else:
153
+ content = item["content"]
154
+ if isinstance(content, str):
155
+ current_user_content.append({"type": "text", "text": content})
156
+ else:
157
+ filepath = content[0]
158
+ file_type = get_file_type(filepath)
159
+ current_user_content.append({"type": file_type, file_type: filepath})
160
+ return messages
161
+
162
+
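For reference, the list produced by combining `process_history` with `process_new_user_message` follows the content-parts shape that `processor.apply_chat_template` consumes in `generate` below. A hypothetical single-image conversation would come out roughly like this:

    # Hypothetical shape of the messages list passed to processor.apply_chat_template.
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail."},
                {"type": "image", "image": "assets/cat.jpeg"},
            ],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "A tabby cat..."}]},
    ]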
+ @spaces.GPU(duration=120)
+ @torch.inference_mode()
+ def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
+     if not validate_media_constraints(message):
+         yield ""
+         return
+
+     messages = []
+     if system_prompt:
+         messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+     messages.extend(process_history(history))
+     messages.append({"role": "user", "content": process_new_user_message(message)})
+
+     inputs = processor.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+     )
+     n_tokens = inputs["input_ids"].shape[1]
+     if n_tokens > MAX_INPUT_TOKENS:
+         gr.Warning(
+             f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
+         )
+         yield ""
+         return
+
+     inputs = inputs.to(device=model.device, dtype=torch.bfloat16)
+
+     streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         inputs,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=False,
+         disable_compile=True,
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     output = ""
+     for delta in streamer:
+         output += delta
+         yield output
+
+
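Design note: `model.generate` blocks until decoding finishes, so it runs on a background `Thread` while the main thread drains the `TextIteratorStreamer`; each iteration yields the full text so far, which is what `gr.ChatInterface` re-renders. The same function can also be driven outside Gradio. A minimal sketch, assuming the model weights are accessible and the example asset exists locally:

    # Minimal sketch: call generate() directly instead of through ChatInterface.
    message = {"text": "Describe this image in detail.", "files": ["assets/cat.jpeg"]}
    for partial in generate(message, history=[], system_prompt="You are a helpful assistant."):
        print(partial)  # each value is the cumulative output so far, not a delta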
+ examples = [
+     [
+         {
+             "text": "What is the capital of France?",
+             "files": [],
+         }
+     ],
+     [
+         {
+             "text": "Describe this image in detail.",
+             "files": ["assets/cat.jpeg"],
+         }
+     ],
+     [
+         {
+             "text": "Transcribe the following speech segment in English.",
+             "files": ["assets/speech.wav"],
+         }
+     ],
+     [
+         {
+             "text": "Transcribe the following speech segment in English.",
+             "files": ["assets/speech2.wav"],
+         }
+     ],
+ ]
+
+ demo = gr.ChatInterface(
+     fn=generate,
+     type="messages",
+     textbox=gr.MultimodalTextbox(
+         file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
+         file_count="multiple",
+         autofocus=True,
+     ),
+     multimodal=True,
+     additional_inputs=[
+         gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
+         gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
+     ],
+     stop_btn=False,
+     title="Gemma 3n E4B it",
+     examples=examples,
+     run_examples_on_click=False,
+     cache_examples=False,
+     css_paths="style.css",
+     delete_cache=(1800, 1800),
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
assets/cat.jpeg ADDED
Git LFS Details
  • SHA256: 4f4820fd544a706efeca9ad723e0b2185d802ebdc2d719c6823b2340eda0e554
  • Pointer size: 130 Bytes
  • Size of remote file: 90.7 kB
assets/speech.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:219f55927c62af71fa7dc29581b647ade375286498dd0e342cc07c72cc8edc04
+ size 136764
assets/speech2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f03711ca239a7b6f3cd13f3942f4f64b32173f4f1d01abf55fa91608a178efac
+ size 545644
pyproject.toml ADDED
@@ -0,0 +1,67 @@
+ [project]
+ name = "gemma-3n-e4b-it"
+ version = "0.1.0"
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "accelerate>=1.8.1",
+     "av>=14.4.0",
+     "gradio>=5.34.2",
+     "hf-transfer>=0.1.9",
+     "librosa>=0.11.0",
+     "spaces>=0.37.1",
+     "torch==2.5.1",
+     "torchvision>=0.20.1",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812",  # missing-trailing-comma
+     "D203",  # one-blank-line-before-class
+     "D213",  # multi-line-summary-second-line
+     "E501",  # line-too-long
+     "SIM117",  # multiple-with-statements
+     #
+     "D100",  # undocumented-public-module
+     "D101",  # undocumented-public-class
+     "D102",  # undocumented-public-method
+     "D103",  # undocumented-public-function
+     "D104",  # undocumented-public-package
+     "D105",  # undocumented-magic-method
+     "D107",  # undocumented-public-init
+     "EM101",  # raw-string-in-exception
+     "FBT001",  # boolean-type-hint-positional-argument
+     "FBT002",  # boolean-default-value-positional-argument
+     "PD901",  # pandas-df-variable-name
+     "PGH003",  # blanket-type-ignore
+     "PLR0913",  # too-many-arguments
+     "PLR0915",  # too-many-statements
+     "TRY003",  # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401",  # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
+
+ [tool.uv.sources]
+ timm = { path = "wheels/timm-1.0.16.dev0-py3-none-any.whl" }
+ transformers = { path = "wheels/transformers-4.53.0.dev0-py3-none-any.whl" }
+
+ [dependency-groups]
+ dev = [
+     "timm",
+     "transformers",
+ ]
requirements.txt ADDED
@@ -0,0 +1,288 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ accelerate==1.8.1
+     # via gemma-3n-e4b-it (pyproject.toml)
+ aiofiles==24.1.0
+     # via gradio
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.9.0
+     # via
+     #   gradio
+     #   httpx
+     #   starlette
+ audioread==3.0.1
+     # via librosa
+ av==14.4.0
+     # via gemma-3n-e4b-it (pyproject.toml)
+ certifi==2025.6.15
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ cffi==1.17.1
+     # via soundfile
+ charset-normalizer==3.4.2
+     # via requests
+ click==8.2.1
+     # via
+     #   typer
+     #   uvicorn
+ decorator==5.2.1
+     # via librosa
+ exceptiongroup==1.3.0
+     # via anyio
+ fastapi==0.115.13
+     # via gradio
+ ffmpy==0.6.0
+     # via gradio
+ filelock==3.18.0
+     # via
+     #   huggingface-hub
+     #   torch
+     #   triton
+ fsspec==2025.5.1
+     # via
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.34.2
+     # via
+     #   gemma-3n-e4b-it (pyproject.toml)
+     #   spaces
+ gradio-client==1.10.3
+     # via gradio
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-transfer==0.1.9
+     # via gemma-3n-e4b-it (pyproject.toml)
+ hf-xet==1.1.5
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   gradio
+     #   gradio-client
+     #   safehttpx
+     #   spaces
+ huggingface-hub==0.33.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.1
+     # via
+     #   librosa
+     #   scikit-learn
+ lazy-loader==0.4
+     # via librosa
+ librosa==0.11.0
+     # via gemma-3n-e4b-it (pyproject.toml)
+ llvmlite==0.44.0
+     # via numba
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ msgpack==1.1.1
+     # via librosa
+ networkx==3.4.2
+     # via torch
+ numba==0.61.2
+     # via librosa
+ numpy==2.2.6
+     # via
+     #   accelerate
+     #   gradio
+     #   librosa
+     #   numba
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   soundfile
+     #   soxr
+     #   torchvision
+ nvidia-cublas-cu12==12.4.5.8
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.4.127
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.4.127
+     # via torch
+ nvidia-cuda-runtime-cu12==12.4.127
+     # via torch
+ nvidia-cudnn-cu12==9.1.0.70
+     # via torch
+ nvidia-cufft-cu12==11.2.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.5.147
+     # via torch
+ nvidia-cusolver-cu12==11.6.1.9
+     # via torch
+ nvidia-cusparse-cu12==12.3.1.170
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-nccl-cu12==2.21.5
+     # via torch
+ nvidia-nvjitlink-cu12==12.4.127
+     # via
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.4.127
+     # via torch
+ orjson==3.10.18
+     # via gradio
+ packaging==25.0
+     # via
+     #   accelerate
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   lazy-loader
+     #   pooch
+     #   spaces
+ pandas==2.3.0
+     # via gradio
+ pillow==11.2.1
+     # via
+     #   gradio
+     #   torchvision
+ platformdirs==4.3.8
+     # via pooch
+ pooch==1.8.2
+     # via librosa
+ psutil==5.9.8
+     # via
+     #   accelerate
+     #   spaces
+ pycparser==2.22
+     # via cffi
+ pydantic==2.11.7
+     # via
+     #   fastapi
+     #   gradio
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-multipart==0.0.20
+     # via gradio
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.2
+     # via
+     #   accelerate
+     #   gradio
+     #   huggingface-hub
+ requests==2.32.4
+     # via
+     #   huggingface-hub
+     #   pooch
+     #   spaces
+ rich==14.0.0
+     # via typer
+ ruff==0.12.0
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.5.3
+     # via accelerate
+ scikit-learn==1.7.0
+     # via librosa
+ scipy==1.15.3
+     # via
+     #   librosa
+     #   scikit-learn
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ soundfile==0.13.1
+     # via librosa
+ soxr==0.5.0.post1
+     # via librosa
+ spaces==0.37.1
+     # via gemma-3n-e4b-it (pyproject.toml)
+ starlette==0.46.2
+     # via
+     #   fastapi
+     #   gradio
+ sympy==1.13.1
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.5.1
+     # via
+     #   gemma-3n-e4b-it (pyproject.toml)
+     #   accelerate
+     #   torchvision
+ torchvision==0.20.1
+     # via gemma-3n-e4b-it (pyproject.toml)
+ tqdm==4.67.1
+     # via huggingface-hub
+ triton==3.1.0
+     # via torch
+ typer==0.16.0
+     # via gradio
+ typing-extensions==4.14.0
+     # via
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   librosa
+     #   pydantic
+     #   pydantic-core
+     #   rich
+     #   spaces
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.1
+     # via pydantic
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.34.3
+     # via gradio
+ websockets==15.0.1
+     # via gradio-client
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
wheels/timm-1.0.16.dev0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc9686fd88dfc74611ac0eb9c6a8598ea33f9cfc8cb1a7477cd9c765c659bae9
+ size 2485728
wheels/transformers-4.53.0.dev0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f2c9040adc3ac4574b20aaf04883ba8a5f0ef6fc62b1d1ca24744369205fdd5
+ size 11532544