Initial commit
- Dockerfile +79 -0
- app.py +21 -0
- attempts.txt +0 -0
- change_hardware.py +48 -0
- entrypoint.sh +36 -0
- failed_attempts.txt +0 -0
- hardware.txt +1 -0
- models.txt +1 -0
- pause_space.py +7 -0
- requirements.txt +10 -0
- tasks.txt +1 -0
Dockerfile
ADDED
@@ -0,0 +1,79 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=12.1
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

#ENV HOME=/home/user \
#    PATH=/home/user/.local/bin:/opt/conda/bin:$PATH

ENV PATH=/opt/conda/bin:$PATH

RUN mkdir -p .cache
#RUN mkdir -p data
# I'm not sure how to allow later python files used here to write to .cache without making it world-writable.
RUN chmod 777 -R .cache
#RUN chmod 777 -R data

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    python3 \
    python3-pip \
    git && \
    rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
        "linux/arm64") MAMBA_ARCH=aarch64 ;; \
        *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
        "linux/arm64") exit 1 ;; \
        *) /opt/conda/bin/conda update -y conda && \
           /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=${PYTORCH_VERSION}" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

COPY ./requirements.txt requirements.txt
RUN pip install -r requirements.txt

RUN git clone -b energy_star_dev https://github.com/huggingface/optimum-benchmark.git /optimum-benchmark && \
    cd /optimum-benchmark && pip install -e .

COPY ./*.txt /
COPY ./.cache /.cache
COPY ./entrypoint.sh /entrypoint.sh
COPY ./pause_space.py /pause_space.py
COPY ./parse_requests.py /parse_requests.py
COPY ./process_runs.py /process_runs.py
COPY ./app/runs /app/runs

RUN chmod 777 *.py
RUN chmod 777 -R /app/runs
RUN chmod 777 -R /.cache
RUN chmod 777 /attempts.txt
RUN chmod 777 /failed_attempts.txt
RUN chmod +x /entrypoint.sh

# Expose the secret token at buildtime and use its value as git remote URL
RUN --mount=type=secret,id=BULK_ENERGY_TOKEN,mode=0444,required=true \
    git init && \
    git remote add origin $(cat /run/secrets/BULK_ENERGY_TOKEN)

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
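On the .cache comment above: one alternative to chmod 777 — a sketch, not what this image does — is to point the Hugging Face cache at a directory the runtime user already owns, via the HF_HOME environment variable that huggingface_hub honors. The path below is an assumed example:

import os

# Hypothetical alternative to `RUN chmod 777 -R .cache`: redirect the HF cache
# to a user-owned directory before anything imports huggingface_hub.
os.environ["HF_HOME"] = "/home/user/.cache/huggingface"  # assumed writable path
os.makedirs(os.environ["HF_HOME"], exist_ok=True)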
app.py
ADDED
@@ -0,0 +1,21 @@
import os

from fastapi import FastAPI
from huggingface_hub import HfApi

TOKEN = os.environ.get("BULK_ENERGY_TOKEN")
API = HfApi(token=TOKEN)

REPO_ID = "AIEnergyScore/BulkCalcSpace"
app = FastAPI()


@app.get("/")
def start_train():
    # get_space_variables returns a dict of SpaceVariable objects keyed by name,
    # so compare against the variable's .value, not the object itself.
    space_variables = API.get_space_variables(repo_id=REPO_ID)
    status = space_variables['STATUS'].value if 'STATUS' in space_variables else None
    if status is not None and status != 'COMPUTING':
        print("Beginning processing.")
        API.add_space_variable(repo_id=REPO_ID, key='STATUS', value='COMPUTING')
        os.system(f"./entrypoint.sh {REPO_ID}")
        API.add_space_variable(repo_id=REPO_ID, key='STATUS', value='NOT_COMPUTING')
        print("Pausing space")
        API.pause_space(REPO_ID)
    return {"Status": status}
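Because the container serves this app with uvicorn on port 7860 and start_train() pauses its own Space once a run finishes, something outside the Space has to wake it up and hit "/" to kick off the next run. A minimal sketch of such a trigger, assuming the default <owner>-<name>.hf.space URL scheme and an HF_TOKEN with write access (this script is not part of the commit):

import os
import time

import requests
from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))
repo_id = "AIEnergyScore/BulkCalcSpace"

# Wake the Space; it paused itself at the end of the previous run.
api.restart_space(repo_id)
time.sleep(60)  # crude wait for the container to come up; adjust as needed

# start_train() runs the whole benchmark synchronously, so this request
# can take a very long time; no timeout is set here for that reason.
resp = requests.get("https://aienergyscore-bulkcalcspace.hf.space/")
print(resp.json())  # e.g. {"Status": "NOT_COMPUTING"}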
attempts.txt
ADDED
File without changes
change_hardware.py
ADDED
@@ -0,0 +1,48 @@
import os
import argparse
from huggingface_hub import HfApi

REPO_ID = "AIEnergyScore/benchmark-hugs-models"
TOKEN = os.environ.get("HF_TOKEN")
API = HfApi(token=TOKEN)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hardware",
        default="a10g-large",
        type=str,
        required=False,
        help="Hardware to use for benchmarking.",
    )
    args = parser.parse_args()
    return args


# Based on huggingface_hub _space_api.py
# CPU_BASIC = "cpu-basic"
# CPU_UPGRADE = "cpu-upgrade"
# T4_SMALL = "t4-small"
# T4_MEDIUM = "t4-medium"
# L4X1 = "l4x1"
# L4X4 = "l4x4"
# ZERO_A10G = "zero-a10g"
# A10G_SMALL = "a10g-small"
# A10G_LARGE = "a10g-large"
# A10G_LARGEX2 = "a10g-largex2"
# A10G_LARGEX4 = "a10g-largex4"
# A100_LARGE = "a100-large"
# V5E_1X1 = "v5e-1x1"
# V5E_2X2 = "v5e-2x2"
# V5E_2X4 = "v5e-2x4"

#curr_runtime = API.get_space_runtime(repo_id=REPO_ID)
#print(curr_runtime)
#requested_hardware = curr_runtime.requested_hardware
#print(requested_hardware)
#hardware_idx = hardware_options.index(requested_hardware)
#next_hardware = hardware_options[hardware_idx + 1]

if __name__ == '__main__':
    args = parse_args()
    curr_runtime = API.get_space_runtime(repo_id=REPO_ID)
    curr_hardware = curr_runtime.requested_hardware
    if curr_hardware != args.hardware:
        API.request_space_hardware(repo_id=REPO_ID, hardware=args.hardware)
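request_space_hardware() only files the request; the target Space then rebuilds on the new flavor, so a caller may proceed before the switch has taken effect. If it needs to block until the new hardware is live, one option is to poll get_space_runtime() — a sketch with a hypothetical helper (wait_for_hardware is not part of this commit, and assumes stage and requested_hardware compare cleanly against their string values):

import time

from huggingface_hub import HfApi

def wait_for_hardware(api: HfApi, repo_id: str, hardware: str, poll_seconds: int = 30):
    # Block until the Space reports it is RUNNING on the requested hardware.
    while True:
        runtime = api.get_space_runtime(repo_id=repo_id)
        if runtime.stage == "RUNNING" and runtime.requested_hardware == hardware:
            return runtime
        time.sleep(poll_seconds)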
entrypoint.sh
ADDED
@@ -0,0 +1,36 @@
#!/bin/bash

config_dir="/optimum-benchmark/examples/energy_star/"

# This script is meant to be called from a python script
# that provides the REPO_ID as the first argument.
REPO_ID=$1
echo "Attempting to run."
cat /tasks.txt | while read -r task; do # Example alternative: for task in "text_generation" "question_answering"; do
    cat /hardware.txt | while read -r hardware; do # Example alternative: for hardware in "a100-large" "l4x1" "l40sx1"; do
        echo "Attempting to benchmark ${hardware}"
        python /change_hardware.py --hardware "${hardware}"
        # For each model
        cat /models.txt | while read -r model; do # Example alternative: for model in "NousResearch/Hermes-3-Llama-3.1-8B" "Qwen/Qwen2.5-7B-Instruct"; do
            # Read the name of the model and the experiment.
            echo "Benchmarking Model: ${model}, Task: ${task}, Hardware: ${hardware}"

            # Initialize the directory for output.
            now=$(date +%Y-%m-%d-%H-%M-%S)
            run_dir="/app/runs/${task}/${model}/${now}"
            mkdir -p "$run_dir"
            # Save the task/model run directory to a text file, for tracking purposes.
            echo "${run_dir}" >> /attempts.txt

            { # try
                # Let the benchmarking begin!
                optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
            } || { # catch
                echo "${run_dir}" >> /failed_attempts.txt
            }
        done
    done
done

echo "Finished"
# Update the requests dataset and results dataset:
#python /process_runs.py
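The two tracking files form a simple ledger: every task/hardware/model combination is appended to /attempts.txt before its benchmark starts, and only combinations whose optimum-benchmark invocation exits non-zero land in /failed_attempts.txt, so the set difference is the successful runs. A sketch of reading that ledger back (a hypothetical helper, not part of this commit):

def summarize(attempts_path="/attempts.txt", failed_path="/failed_attempts.txt"):
    # Each line in both files is a run directory written by entrypoint.sh.
    with open(attempts_path) as f:
        attempted = {line.strip() for line in f if line.strip()}
    with open(failed_path) as f:
        failed = {line.strip() for line in f if line.strip()}
    succeeded = attempted - failed
    print(f"{len(succeeded)}/{len(attempted)} runs succeeded")
    for run_dir in sorted(failed):
        # stderr of every run was redirected to <run_dir>/error.log
        print(f"FAILED: {run_dir} (see {run_dir}/error.log)")

if __name__ == "__main__":
    summarize()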
failed_attempts.txt
ADDED
File without changes
hardware.txt
ADDED
@@ -0,0 +1 @@
a100-large
models.txt
ADDED
@@ -0,0 +1 @@
NousResearch/Hermes-3-Llama-3.1-8B
pause_space.py
ADDED
@@ -0,0 +1,7 @@
from huggingface_hub import HfApi
import os

#REPO_ID =
#API.add_space_variable(repo_id=REPO_ID, key='STATUS', value='COMPUTING')
TOKEN = os.environ.get("BULK_COMPUTE_SPACE")
API = HfApi(token=TOKEN)
API.pause_space("AIEnergyScore/launch-computation-example")
requirements.txt
ADDED
@@ -0,0 +1,10 @@
accelerate==0.33.0
codecarbon==2.5.1
datasets==2.20.0
diffusers==0.30.0
huggingface-hub==0.24.5
librosa==0.10.1
omegaconf==2.3.0
# optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark@energy_star_dev
torch==2.4.0
transformers==4.44.0
tasks.txt
ADDED
@@ -0,0 +1 @@
text_generation