meg HF Staff committed on
Commit
13c6f75
·
verified ·
1 Parent(s): 21e1e2b

Initial commit

Browse files
Files changed (11) hide show
  1. Dockerfile +79 -0
  2. app.py +21 -0
  3. attempts.txt +0 -0
  4. change_hardware.py +48 -0
  5. entrypoint.sh +36 -0
  6. failed_attempts.txt +0 -0
  7. hardware.txt +1 -0
  8. models.txt +1 -0
  9. pause_space.py +7 -0
  10. requirements.txt +10 -0
  11. tasks.txt +1 -0
Dockerfile ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# GPU benchmarking image for the AI Energy Score bulk-calculation Space:
# CUDA base + conda-installed PyTorch + optimum-benchmark (energy_star_dev),
# served by uvicorn on port 7860.
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH

RUN mkdir -p .cache
# NOTE(review): world-writeable so the benchmark scripts (which may run as an
# arbitrary uid) can write to the cache; tighten if a dedicated user is added.
RUN chmod -R 777 .cache

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    python3 \
    python3-pip \
    git && \
    rm -rf /var/lib/apt/lists/*

# Install conda, translating Docker's TARGETPLATFORM into mamba arches.
# FIX: the original passed both -o and -O to curl; with a single URL the extra
# output option makes curl warn "more output options than URLs" — only -o is
# needed to save to ~/mambaforge.sh.
RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch via conda.
# On arm64 we exit with an error code (no CUDA build for that platform here).
RUN case ${TARGETPLATFORM} in \
    "linux/arm64") exit 1 ;; \
    *) /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# NOTE(review): requirements.txt pins torch==2.4.0, which pip may install on
# top of the conda pytorch above — confirm this duplication is intentional.
COPY ./requirements.txt requirements.txt
RUN pip install -r requirements.txt

RUN git clone -b energy_star_dev https://github.com/huggingface/optimum-benchmark.git /optimum-benchmark && cd optimum-benchmark && pip install -e .

COPY ./*.txt /
COPY ./.cache /.cache
COPY ./entrypoint.sh /entrypoint.sh
COPY ./pause_space.py /pause_space.py
COPY ./parse_requests.py /parse_requests.py
COPY ./process_runs.py /process_runs.py
COPY ./app/runs /app/runs

# Benchmark runs write into these paths at runtime; see the NOTE above about 777.
RUN chmod 777 *.py
RUN chmod -R 777 /app/runs
RUN chmod -R 777 /.cache
RUN chmod 777 /attempts.txt
RUN chmod 777 /failed_attempts.txt
RUN chmod +x /entrypoint.sh

# Expose the secret token at buildtime and use its value as git remote URL
RUN --mount=type=secret,id=BULK_ENERGY_TOKEN,mode=0444,required=true \
    git init && \
    git remote add origin $(cat /run/secrets/BULK_ENERGY_TOKEN)

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from fastapi import FastAPI
from huggingface_hub import HfApi

# Hub token, provided to the Space as a secret environment variable.
TOKEN = os.environ.get("BULK_ENERGY_TOKEN")
API = HfApi(token=TOKEN)

REPO_ID = "AIEnergyScore/BulkCalcSpace"
app = FastAPI()

@app.get("/")
def start_train():
    """Trigger a benchmarking run for the Space, then pause it.

    Uses the Space variable ``STATUS`` as a crude lock: the run only starts
    when ``STATUS`` exists and is not ``COMPUTING``. Returns the ``STATUS``
    value observed at the start of the request.
    """
    space_variables = API.get_space_variables(repo_id=REPO_ID)
    if 'STATUS' in space_variables and space_variables['STATUS'] != 'COMPUTING':
        print("Beginning processing.")
        # Mark the Space busy so a concurrent GET / does not start a second run.
        API.add_space_variable(repo_id=REPO_ID, key='STATUS', value='COMPUTING')
        os.system(f"./entrypoint.sh {REPO_ID}")
        API.add_space_variable(repo_id=REPO_ID, key='STATUS', value='NOT_COMPUTING')
        print("Pausing space")
        API.pause_space(REPO_ID)
    # FIX: the original indexed space_variables['STATUS'] unconditionally and
    # raised KeyError when the variable had never been set (the 'in' guard
    # above only protects the branch, not this return).
    return {"Status": space_variables.get('STATUS', 'UNSET')}
attempts.txt ADDED
File without changes
change_hardware.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import argparse
from huggingface_hub import HfApi

REPO_ID = "AIEnergyScore/benchmark-hugs-models"
TOKEN = os.environ.get("HF_TOKEN")
API = HfApi(token=TOKEN)

def parse_args():
    """Parse the command line; returns a namespace with the target hardware."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hardware",
        type=str,
        default="a10g-large",
        required=False,
        help="hardware to use for benchmarking.",
    )
    return parser.parse_args()

# Known Space hardware flavors (per huggingface_hub _space_api.py):
# cpu-basic, cpu-upgrade, t4-small, t4-medium, l4x1, l4x4, zero-a10g,
# a10g-small, a10g-large, a10g-largex2, a10g-largex4, a100-large,
# v5e-1x1, v5e-2x2, v5e-2x4.

if __name__ == '__main__':
    args = parse_args()
    current_hardware = API.get_space_runtime(repo_id=REPO_ID).requested_hardware
    # Only ask the Hub for a change when the Space is not already on the
    # requested flavor — a redundant request would restart the Space.
    if current_hardware != args.hardware:
        API.request_space_hardware(repo_id=REPO_ID, hardware=args.hardware)
entrypoint.sh ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Bulk benchmarking driver. For every (task, hardware, model) combination
# listed in /tasks.txt, /hardware.txt and /models.txt, switch the Space to the
# requested hardware and run optimum-benchmark, logging each run directory to
# /attempts.txt (and to /failed_attempts.txt when the benchmark fails).

config_dir="/optimum-benchmark/examples/energy_star/"

# This script is meant to be called from a python script
# that provides the REPO_ID as the first argument.
# FIX: the original wrote `REPO_ID = $1`, which bash parses as running a
# command named REPO_ID with arguments `=` and `$1` — assignment takes no spaces.
REPO_ID="$1"
echo "Attempting to run."
while read -r task; do
    while read -r hardware; do
        echo "Attempting to benchmark ${hardware}"
        python /change_hardware.py --hardware "${hardware}"
        # For each model
        while read -r model; do
            echo "Benchmarking Model: ${model}, Task: ${task}, Hardware: ${hardware}"

            # Initialize the directory for output.
            now=$(date +%Y-%m-%d-%H-%M-%S)
            run_dir="/app/runs/${task}/${model}/${now}"
            mkdir -p "$run_dir"
            # Save the task/model run directory to text file, for tracking purposes.
            echo "${run_dir}" >> /attempts.txt

            { # try
                # Let the benchmarking begin!
                optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
            } || { # catch
                echo "${run_dir}" >> /failed_attempts.txt
            }
        done < /models.txt
    done < /hardware.txt
done < /tasks.txt

# FIX: the original line `echo "Finished"# updating ... dataset."` had an
# unbalanced trailing quote (unterminated string at end of file) and a `#`
# glued to the word, which does not start a comment in bash.
echo "Finished"
# updating requests dataset and results dataset:
#python /process_runs.py
failed_attempts.txt ADDED
File without changes
hardware.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ a100-large
models.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ NousResearch/Hermes-3-Llama-3.1-8B
pause_space.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import os
from huggingface_hub import HfApi

# Pause the computation Space once work is done so it stops consuming hardware.
TOKEN = os.environ.get("BULK_COMPUTE_SPACE")
API = HfApi(token=TOKEN)
API.pause_space("AIEnergyScore/launch-computation-example")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.33.0
2
+ codecarbon==2.5.1
3
+ datasets==2.20.0
4
+ diffusers==0.30.0
5
+ huggingface-hub==0.24.5
6
+ librosa==0.10.1
7
+ omegaconf==2.3.0
8
+ # optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark@energy_star_dev
9
+ torch==2.4.0
10
+ transformers==4.44.0
tasks.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ text_generation