Commit
·
5f3f012
1
Parent(s):
7a9f54b
feat: auto pause hugsim server
Browse files- competitions/runner.py +14 -1
- competitions/utils.py +39 -8
- requirements.txt +1 -0
competitions/runner.py
CHANGED
|
@@ -5,17 +5,21 @@ import os
|
|
| 5 |
import time
|
| 6 |
import uuid
|
| 7 |
import shutil
|
|
|
|
| 8 |
from dataclasses import dataclass
|
| 9 |
from typing import List, Dict, Any
|
| 10 |
from collections import defaultdict
|
| 11 |
|
|
|
|
| 12 |
import pandas as pd
|
| 13 |
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
|
| 14 |
from loguru import logger
|
| 15 |
|
| 16 |
from competitions.enums import SubmissionStatus, ErrorMessage
|
| 17 |
from competitions.info import CompetitionInfo
|
| 18 |
-
from competitions.utils import user_token_api, space_cleaner, dockerfile_modifier
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
@dataclass
|
|
@@ -219,6 +223,7 @@ class JobRunner:
|
|
| 219 |
shutil.rmtree(client_code_local_dir, ignore_errors=True)
|
| 220 |
self._queue_submission(team_id, submission_id)
|
| 221 |
|
|
|
|
| 222 |
def run(self):
|
| 223 |
cur = 0
|
| 224 |
while True:
|
|
@@ -231,6 +236,7 @@ class JobRunner:
|
|
| 231 |
# Clean up spaces every 100 iterations
|
| 232 |
if cur % 100 == 1:
|
| 233 |
logger.info("Cleaning up spaces...")
|
|
|
|
| 234 |
for space in all_submissions:
|
| 235 |
if space["status"] in {SubmissionStatus.QUEUED.value, SubmissionStatus.PROCESSING.value}:
|
| 236 |
logger.info(f"Cleaning up space {space['space_id']} for submission {space['submission_id']}")
|
|
@@ -239,6 +245,11 @@ class JobRunner:
|
|
| 239 |
space["team_id"],
|
| 240 |
space["submission_id"],
|
| 241 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
pending_submissions = self._get_pending_subs(all_submissions)
|
| 244 |
if pending_submissions is None:
|
|
@@ -249,6 +260,8 @@ class JobRunner:
|
|
| 249 |
if server_active_count[first_pending_sub["server_url"]] >= 1:
|
| 250 |
continue
|
| 251 |
try:
|
|
|
|
|
|
|
| 252 |
self.create_space(first_pending_sub["team_id"], first_pending_sub["submission_id"], first_pending_sub["submission_repo"], first_pending_sub["space_id"], first_pending_sub["server_url"], first_pending_sub["hardware"])
|
| 253 |
except Exception as e:
|
| 254 |
logger.error(
|
|
|
|
| 5 |
import time
|
| 6 |
import uuid
|
| 7 |
import shutil
|
| 8 |
+
import os
|
| 9 |
from dataclasses import dataclass
|
| 10 |
from typing import List, Dict, Any
|
| 11 |
from collections import defaultdict
|
| 12 |
|
| 13 |
+
import tenacity
|
| 14 |
import pandas as pd
|
| 15 |
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
|
| 16 |
from loguru import logger
|
| 17 |
|
| 18 |
from competitions.enums import SubmissionStatus, ErrorMessage
|
| 19 |
from competitions.info import CompetitionInfo
|
| 20 |
+
from competitions.utils import user_token_api, space_cleaner, dockerfile_modifier, server_manager
|
| 21 |
+
|
| 22 |
+
IS_AUTO_PAUSE_HUGSIM_SERVER = os.getenv("IS_AUTO_PAUSE_HUGSIM_SERVER", "true").lower() == "true"
|
| 23 |
|
| 24 |
|
| 25 |
@dataclass
|
|
|
|
| 223 |
shutil.rmtree(client_code_local_dir, ignore_errors=True)
|
| 224 |
self._queue_submission(team_id, submission_id)
|
| 225 |
|
| 226 |
+
@tenacity.retry(stop=tenacity.stop_never, wait=tenacity.wait_fixed(15))
|
| 227 |
def run(self):
|
| 228 |
cur = 0
|
| 229 |
while True:
|
|
|
|
| 236 |
# Clean up spaces every 100 iterations
|
| 237 |
if cur % 100 == 1:
|
| 238 |
logger.info("Cleaning up spaces...")
|
| 239 |
+
ready_submissions_count = 0
|
| 240 |
for space in all_submissions:
|
| 241 |
if space["status"] in {SubmissionStatus.QUEUED.value, SubmissionStatus.PROCESSING.value}:
|
| 242 |
logger.info(f"Cleaning up space {space['space_id']} for submission {space['submission_id']}")
|
|
|
|
| 245 |
space["team_id"],
|
| 246 |
space["submission_id"],
|
| 247 |
)
|
| 248 |
+
if space["status"] in {SubmissionStatus.QUEUED.value, SubmissionStatus.PROCESSING.value, SubmissionStatus.PROCESSING.value}:
|
| 249 |
+
ready_submissions_count += 1
|
| 250 |
+
if ready_submissions_count == 0:
|
| 251 |
+
if IS_AUTO_PAUSE_HUGSIM_SERVER:
|
| 252 |
+
server_manager.pause_all_servers()
|
| 253 |
|
| 254 |
pending_submissions = self._get_pending_subs(all_submissions)
|
| 255 |
if pending_submissions is None:
|
|
|
|
| 260 |
if server_active_count[first_pending_sub["server_url"]] >= 1:
|
| 261 |
continue
|
| 262 |
try:
|
| 263 |
+
if IS_AUTO_PAUSE_HUGSIM_SERVER:
|
| 264 |
+
server_manager.start_all_servers()
|
| 265 |
self.create_space(first_pending_sub["team_id"], first_pending_sub["submission_id"], first_pending_sub["submission_repo"], first_pending_sub["space_id"], first_pending_sub["server_url"], first_pending_sub["hardware"])
|
| 266 |
except Exception as e:
|
| 267 |
logger.error(
|
competitions/utils.py
CHANGED
|
@@ -8,10 +8,12 @@ import threading
|
|
| 8 |
import uuid
|
| 9 |
import base64
|
| 10 |
import glob
|
|
|
|
| 11 |
from typing import Optional, Dict, Any, List, Literal, Tuple
|
| 12 |
from collections import defaultdict
|
| 13 |
from datetime import datetime, timezone, timedelta
|
| 14 |
|
|
|
|
| 15 |
import requests
|
| 16 |
import pandas as pd
|
| 17 |
import jwt
|
|
@@ -485,8 +487,11 @@ user_token_api = UserTokenApi(
|
|
| 485 |
|
| 486 |
|
| 487 |
class ServerManager:
|
| 488 |
-
def __init__(self, server_url_list: List[str]):
|
|
|
|
|
|
|
| 489 |
self.server_url_list = server_url_list
|
|
|
|
| 490 |
self._cur_index = 0
|
| 491 |
self._lock = threading.Lock()
|
| 492 |
|
|
@@ -496,8 +501,37 @@ class ServerManager:
|
|
| 496 |
self._cur_index = (self._cur_index + 1) % len(self.server_url_list)
|
| 497 |
return server_url
|
| 498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
-
server_manager = ServerManager(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
|
| 503 |
class SubmissionApi:
|
|
@@ -562,8 +596,7 @@ class SubmissionApi:
|
|
| 562 |
|
| 563 |
def count_by_status(self, team_id: str, status_list: List[SubmissionStatus]) -> int:
|
| 564 |
user_submission_info = self.download_submission_info(team_id)
|
| 565 |
-
count = sum(1 for submission in user_submission_info["submissions"] if
|
| 566 |
-
SubmissionStatus(submission["status"]) in status_list)
|
| 567 |
return count
|
| 568 |
|
| 569 |
|
|
@@ -684,8 +717,7 @@ class SpaceCleaner:
|
|
| 684 |
submission_api.update_submission_data(
|
| 685 |
team_id=team_id,
|
| 686 |
submission_id=submission_id,
|
| 687 |
-
data={"status": SubmissionStatus.FAILED.value,
|
| 688 |
-
"error_message": ErrorMessage.BUILD_SPACE_FAILED.value},
|
| 689 |
)
|
| 690 |
else:
|
| 691 |
self.api.restart_space(repo_id=space_id)
|
|
@@ -923,8 +955,7 @@ class DockerfileModifier:
|
|
| 923 |
if last_user is None:
|
| 924 |
modified_lines.insert(-1, f"COPY {self.source_so_path}" + f" {self.tatget_so_path}")
|
| 925 |
else:
|
| 926 |
-
modified_lines.insert(-1,
|
| 927 |
-
f"COPY --chown={last_user}:{last_user} {self.source_so_path} {self.tatget_so_path}")
|
| 928 |
modified_lines.insert(-1, f"RUN chown -R {last_user}:{last_user} {self.tatget_so_dir}")
|
| 929 |
|
| 930 |
return '\n'.join(modified_lines), changes
|
|
|
|
| 8 |
import uuid
|
| 9 |
import base64
|
| 10 |
import glob
|
| 11 |
+
import time
|
| 12 |
from typing import Optional, Dict, Any, List, Literal, Tuple
|
| 13 |
from collections import defaultdict
|
| 14 |
from datetime import datetime, timezone, timedelta
|
| 15 |
|
| 16 |
+
import tenacity
|
| 17 |
import requests
|
| 18 |
import pandas as pd
|
| 19 |
import jwt
|
|
|
|
| 487 |
|
| 488 |
|
| 489 |
class ServerManager:
|
| 490 |
+
def __init__(self, hf_token: str, server_url_list: List[str], space_name_list: List[str]):
|
| 491 |
+
self.hf_token = hf_token
|
| 492 |
+
self.api = HfApi(token=hf_token)
|
| 493 |
self.server_url_list = server_url_list
|
| 494 |
+
self.space_name_list = space_name_list
|
| 495 |
self._cur_index = 0
|
| 496 |
self._lock = threading.Lock()
|
| 497 |
|
|
|
|
| 501 |
self._cur_index = (self._cur_index + 1) % len(self.server_url_list)
|
| 502 |
return server_url
|
| 503 |
|
| 504 |
+
def pause_all_servers(self):
|
| 505 |
+
"""Pause all servers."""
|
| 506 |
+
for space_name in self.space_name_list:
|
| 507 |
+
try:
|
| 508 |
+
self.api.pause_space(repo_id=space_name)
|
| 509 |
+
except Exception as e:
|
| 510 |
+
logger.error(f"Failed to pause space {space_name} - {e}")
|
| 511 |
+
|
| 512 |
+
@tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_fixed(15))
|
| 513 |
+
def start_all_servers(self):
|
| 514 |
+
"""Start all servers."""
|
| 515 |
+
for space_name in self.space_name_list:
|
| 516 |
+
stage = self.api.space_info(space_name).runtime.stage.lower()
|
| 517 |
+
if stage == "running":
|
| 518 |
+
continue
|
| 519 |
+
self.api.restart_space(repo_id=space_name)
|
| 520 |
+
while True:
|
| 521 |
+
time.sleep(10)
|
| 522 |
+
stage = self.api.space_info(space_name).runtime.stage.lower()
|
| 523 |
+
if "error" in stage:
|
| 524 |
+
self.api.restart_space(repo_id=space_name)
|
| 525 |
+
continue
|
| 526 |
+
if stage == "running":
|
| 527 |
+
break
|
| 528 |
+
|
| 529 |
|
| 530 |
+
server_manager = ServerManager(
|
| 531 |
+
os.environ.get("HF_TOKEN", None),
|
| 532 |
+
["https://xdimlab-hugsim-web-server-0.hf.space"],
|
| 533 |
+
["XDimLab/hugsim_web_server_0"]
|
| 534 |
+
)
|
| 535 |
|
| 536 |
|
| 537 |
class SubmissionApi:
|
|
|
|
| 596 |
|
| 597 |
def count_by_status(self, team_id: str, status_list: List[SubmissionStatus]) -> int:
|
| 598 |
user_submission_info = self.download_submission_info(team_id)
|
| 599 |
+
count = sum(1 for submission in user_submission_info["submissions"] if SubmissionStatus(submission["status"]) in status_list)
|
|
|
|
| 600 |
return count
|
| 601 |
|
| 602 |
|
|
|
|
| 717 |
submission_api.update_submission_data(
|
| 718 |
team_id=team_id,
|
| 719 |
submission_id=submission_id,
|
| 720 |
+
data={"status": SubmissionStatus.FAILED.value, "error_message": ErrorMessage.BUILD_SPACE_FAILED.value},
|
|
|
|
| 721 |
)
|
| 722 |
else:
|
| 723 |
self.api.restart_space(repo_id=space_id)
|
|
|
|
| 955 |
if last_user is None:
|
| 956 |
modified_lines.insert(-1, f"COPY {self.source_so_path}" + f" {self.tatget_so_path}")
|
| 957 |
else:
|
| 958 |
+
modified_lines.insert(-1, f"COPY --chown={last_user}:{last_user} {self.source_so_path} {self.tatget_so_path}")
|
|
|
|
| 959 |
modified_lines.insert(-1, f"RUN chown -R {last_user}:{last_user} {self.tatget_so_dir}")
|
| 960 |
|
| 961 |
return '\n'.join(modified_lines), changes
|
requirements.txt
CHANGED
|
@@ -18,3 +18,4 @@ hf-transfer>=0.1.9
|
|
| 18 |
filelock>=3.13.3
|
| 19 |
cachetools==6.0.0
|
| 20 |
PyJWT>=2.10.1
|
|
|
|
|
|
| 18 |
filelock>=3.13.3
|
| 19 |
cachetools==6.0.0
|
| 20 |
PyJWT>=2.10.1
|
| 21 |
+
tenacity>=9.1.2
|