File size: 5,747 Bytes
977a5ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from __future__ import annotations
import os, time, random
from typing import Dict, Any, List, Literal, Optional
import httpx

HF_API_MODELS = "https://huggingface.co/api/models"
HF_API_DATASETS = "https://huggingface.co/api/datasets"
REFRESH_INTERVAL_SEC = int(os.getenv("HF_REGISTRY_REFRESH_SEC", "21600"))
HTTP_TIMEOUT = float(os.getenv("HF_HTTP_TIMEOUT", "8.0"))

# Curated Crypto Datasets
CRYPTO_DATASETS = {
    "price": [
        "paperswithbacktest/Cryptocurrencies-Daily-Price",
        "linxy/CryptoCoin",
        "sebdg/crypto_data",
        "Farmaanaa/bitcoin_price_timeseries",
        "WinkingFace/CryptoLM-Bitcoin-BTC-USDT",
        "WinkingFace/CryptoLM-Ethereum-ETH-USDT",
        "WinkingFace/CryptoLM-Ripple-XRP-USDT",
    ],
    "news_raw": [
        "flowfree/crypto-news-headlines",
        "edaschau/bitcoin_news",
    ],
    "news_labeled": [
        "SahandNZ/cryptonews-articles-with-price-momentum-labels",
        "tahamajs/bitcoin-individual-news-dataset",
        "tahamajs/bitcoin-enhanced-prediction-dataset-with-comprehensive-news",
        "tahamajs/bitcoin-prediction-dataset-with-local-news-summaries",
        "arad1367/Crypto_Semantic_News",
    ]
}

_SEED_MODELS = ["ElKulako/cryptobert", "kk08/CryptoBERT"]
_SEED_DATASETS = []
for cat in CRYPTO_DATASETS.values():
    _SEED_DATASETS.extend(cat)

class HFRegistry:
    def __init__(self):
        self.models: Dict[str, Dict[str, Any]] = {}
        self.datasets: Dict[str, Dict[str, Any]] = {}
        self.last_refresh = 0.0
        self.fail_reason: Optional[str] = None

    async def _hf_json(self, url: str, params: Dict[str, Any]) -> Any:
        async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client:
            r = await client.get(url, params=params)
            r.raise_for_status()
            return r.json()

    async def refresh(self) -> Dict[str, Any]:
        try:
            # Seed models
            for name in _SEED_MODELS:
                self.models.setdefault(name, {"id": name, "source": "seed", "pipeline_tag": "sentiment-analysis"})
            
            # Seed datasets with category metadata
            for category, dataset_list in CRYPTO_DATASETS.items():
                for name in dataset_list:
                    self.datasets.setdefault(name, {"id": name, "source": "seed", "category": category, "tags": ["crypto", category]})
            
            # Fetch from HF Hub
            q_sent = {"pipeline_tag": "sentiment-analysis", "search": "crypto", "limit": 50}
            models = await self._hf_json(HF_API_MODELS, q_sent)
            for m in models or []:
                mid = m.get("modelId") or m.get("id") or m.get("name")
                if not mid: continue
                self.models[mid] = {
                    "id": mid,
                    "pipeline_tag": m.get("pipeline_tag"),
                    "likes": m.get("likes"),
                    "downloads": m.get("downloads"),
                    "tags": m.get("tags") or [],
                    "source": "hub"
                }
            
            q_crypto = {"search": "crypto", "limit": 100}
            datasets = await self._hf_json(HF_API_DATASETS, q_crypto)
            for d in datasets or []:
                did = d.get("id") or d.get("name")
                if not did: continue
                # Infer category from tags or name
                category = "other"
                tags_str = " ".join(d.get("tags") or []).lower()
                name_lower = did.lower()
                if "price" in tags_str or "ohlc" in tags_str or "price" in name_lower:
                    category = "price"
                elif "news" in tags_str or "news" in name_lower:
                    if "label" in tags_str or "sentiment" in tags_str:
                        category = "news_labeled"
                    else:
                        category = "news_raw"
                
                self.datasets[did] = {
                    "id": did,
                    "likes": d.get("likes"),
                    "downloads": d.get("downloads"),
                    "tags": d.get("tags") or [],
                    "category": category,
                    "source": "hub"
                }
            
            self.last_refresh = time.time()
            self.fail_reason = None
            return {"ok": True, "models": len(self.models), "datasets": len(self.datasets)}
        except Exception as e:
            self.fail_reason = str(e)
            return {"ok": False, "error": self.fail_reason, "models": len(self.models), "datasets": len(self.datasets)}

    def list(self, kind: Literal["models","datasets"]="models", category: Optional[str]=None) -> List[Dict[str, Any]]:
        items = list(self.models.values()) if kind == "models" else list(self.datasets.values())
        if category and kind == "datasets":
            items = [d for d in items if d.get("category") == category]
        return items

    def health(self):
        age = time.time() - (self.last_refresh or 0)
        return {
            "ok": self.last_refresh > 0 and (self.fail_reason is None),
            "last_refresh_epoch": self.last_refresh,
            "age_sec": age,
            "fail_reason": self.fail_reason,
            "counts": {"models": len(self.models), "datasets": len(self.datasets)},
            "interval_sec": REFRESH_INTERVAL_SEC
        }

REGISTRY = HFRegistry()

async def periodic_refresh(loop_sleep: int = REFRESH_INTERVAL_SEC):
    await REGISTRY.refresh()
    await _sleep(int(loop_sleep * random.uniform(0.5, 0.9)))
    while True:
        await REGISTRY.refresh()
        await _sleep(loop_sleep)

async def _sleep(sec: int):
    import asyncio
    try:
        await asyncio.sleep(sec)
    except: pass