|
|
|
|
|
""" |
|
|
HuggingFace Dataset Aggregator - Uses ALL Free HF Datasets |
|
|
Maximizes usage of all available free HuggingFace datasets for historical OHLCV data |
|
|
""" |
|
|
|
|
|
import httpx |
|
|
import logging |
|
|
import io |
|
|
import csv |
|
|
from typing import Dict, Any, List, Optional |
|
|
from datetime import datetime |
|
|
from fastapi import HTTPException |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
class HFDatasetAggregator: |
|
|
""" |
|
|
Aggregates historical OHLCV data from ALL free HuggingFace datasets: |
|
|
- linxy/CryptoCoin (26 symbols x 7 timeframes = 182 CSVs) |
|
|
- WinkingFace/CryptoLM-Bitcoin-BTC-USDT |
|
|
- WinkingFace/CryptoLM-Ethereum-ETH-USDT |
|
|
- WinkingFace/CryptoLM-Solana-SOL-USDT |
|
|
- WinkingFace/CryptoLM-Ripple-XRP-USDT |
|
|
""" |
|
|
|
|
|
def __init__(self): |
|
|
self.timeout = 30.0 |
|
|
|
|
|
|
|
|
self.linxy_base_url = "https://huggingface.co/datasets/linxy/CryptoCoin/resolve/main" |
|
|
self.linxy_symbols = [ |
|
|
"BTC", "ETH", "BNB", "XRP", "ADA", "DOGE", "SOL", "TRX", "DOT", "MATIC", |
|
|
"LTC", "SHIB", "AVAX", "UNI", "LINK", "ATOM", "XLM", "ETC", "XMR", "BCH", |
|
|
"NEAR", "APT", "ARB", "OP", "FTM", "ALGO" |
|
|
] |
|
|
self.linxy_timeframes = ["1m", "5m", "15m", "30m", "1h", "4h", "1d"] |
|
|
|
|
|
|
|
|
self.winkingface_datasets = { |
|
|
"BTC": "https://huggingface.co/datasets/WinkingFace/CryptoLM-Bitcoin-BTC-USDT/resolve/main", |
|
|
"ETH": "https://huggingface.co/datasets/WinkingFace/CryptoLM-Ethereum-ETH-USDT/resolve/main", |
|
|
"SOL": "https://huggingface.co/datasets/WinkingFace/CryptoLM-Solana-SOL-USDT/resolve/main", |
|
|
"XRP": "https://huggingface.co/datasets/WinkingFace/CryptoLM-Ripple-XRP-USDT/resolve/main" |
|
|
} |
|
|
|
|
|
|
|
|
self._cache = {} |
|
|
self._cache_duration = 3600 |
|
|
|
|
|
async def get_ohlcv( |
|
|
self, |
|
|
symbol: str, |
|
|
timeframe: str = "1h", |
|
|
limit: int = 1000 |
|
|
) -> List[Dict[str, Any]]: |
|
|
""" |
|
|
Get OHLCV data from HuggingFace datasets with fallback |
|
|
""" |
|
|
symbol = symbol.upper().replace("USDT", "").replace("USD", "") |
|
|
|
|
|
|
|
|
if symbol in self.linxy_symbols and timeframe in self.linxy_timeframes: |
|
|
try: |
|
|
data = await self._get_linxy_ohlcv(symbol, timeframe, limit) |
|
|
if data: |
|
|
logger.info(f"✅ linxy/CryptoCoin: Fetched {len(data)} candles for {symbol}/{timeframe}") |
|
|
return data |
|
|
except Exception as e: |
|
|
logger.warning(f"⚠️ linxy/CryptoCoin failed for {symbol}/{timeframe}: {e}") |
|
|
|
|
|
|
|
|
if symbol in self.winkingface_datasets: |
|
|
try: |
|
|
data = await self._get_winkingface_ohlcv(symbol, timeframe, limit) |
|
|
if data: |
|
|
logger.info(f"✅ WinkingFace: Fetched {len(data)} candles for {symbol}") |
|
|
return data |
|
|
except Exception as e: |
|
|
logger.warning(f"⚠️ WinkingFace failed for {symbol}: {e}") |
|
|
|
|
|
raise HTTPException( |
|
|
status_code=404, |
|
|
detail=f"No HuggingFace dataset found for {symbol}/{timeframe}" |
|
|
) |
|
|
|
|
|
async def _get_linxy_ohlcv( |
|
|
self, |
|
|
symbol: str, |
|
|
timeframe: str, |
|
|
limit: int |
|
|
) -> List[Dict[str, Any]]: |
|
|
"""Get OHLCV data from linxy/CryptoCoin dataset""" |
|
|
cache_key = f"linxy_{symbol}_{timeframe}" |
|
|
|
|
|
|
|
|
if cache_key in self._cache: |
|
|
cached_data, cached_time = self._cache[cache_key] |
|
|
if (datetime.utcnow().timestamp() - cached_time) < self._cache_duration: |
|
|
logger.info(f"✅ Returning cached data for {symbol}/{timeframe}") |
|
|
return cached_data[:limit] |
|
|
|
|
|
|
|
|
csv_filename = f"{symbol}_{timeframe}.csv" |
|
|
csv_url = f"{self.linxy_base_url}/{csv_filename}" |
|
|
|
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client: |
|
|
response = await client.get(csv_url) |
|
|
response.raise_for_status() |
|
|
|
|
|
|
|
|
csv_content = response.text |
|
|
csv_reader = csv.DictReader(io.StringIO(csv_content)) |
|
|
|
|
|
ohlcv_data = [] |
|
|
for row in csv_reader: |
|
|
try: |
|
|
|
|
|
|
|
|
ohlcv_data.append({ |
|
|
"timestamp": int(row.get("timestamp", 0)), |
|
|
"open": float(row.get("open", 0)), |
|
|
"high": float(row.get("high", 0)), |
|
|
"low": float(row.get("low", 0)), |
|
|
"close": float(row.get("close", 0)), |
|
|
"volume": float(row.get("volume", 0)) |
|
|
}) |
|
|
except (ValueError, KeyError) as e: |
|
|
logger.warning(f"⚠️ Failed to parse row: {e}") |
|
|
continue |
|
|
|
|
|
|
|
|
ohlcv_data.sort(key=lambda x: x["timestamp"], reverse=True) |
|
|
|
|
|
|
|
|
self._cache[cache_key] = (ohlcv_data, datetime.utcnow().timestamp()) |
|
|
|
|
|
return ohlcv_data[:limit] |
|
|
|
|
|
async def _get_winkingface_ohlcv( |
|
|
self, |
|
|
symbol: str, |
|
|
timeframe: str, |
|
|
limit: int |
|
|
) -> List[Dict[str, Any]]: |
|
|
"""Get OHLCV data from WinkingFace datasets""" |
|
|
cache_key = f"winkingface_{symbol}_{timeframe}" |
|
|
|
|
|
|
|
|
if cache_key in self._cache: |
|
|
cached_data, cached_time = self._cache[cache_key] |
|
|
if (datetime.utcnow().timestamp() - cached_time) < self._cache_duration: |
|
|
logger.info(f"✅ Returning cached data for {symbol} (WinkingFace)") |
|
|
return cached_data[:limit] |
|
|
|
|
|
|
|
|
base_url = self.winkingface_datasets[symbol] |
|
|
|
|
|
|
|
|
possible_files = [ |
|
|
f"{symbol}USDT_{timeframe}.csv", |
|
|
f"data.csv", |
|
|
f"{symbol}USDT_1h.csv" |
|
|
] |
|
|
|
|
|
for csv_filename in possible_files: |
|
|
try: |
|
|
csv_url = f"{base_url}/{csv_filename}" |
|
|
|
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client: |
|
|
response = await client.get(csv_url) |
|
|
response.raise_for_status() |
|
|
|
|
|
|
|
|
csv_content = response.text |
|
|
csv_reader = csv.DictReader(io.StringIO(csv_content)) |
|
|
|
|
|
ohlcv_data = [] |
|
|
for row in csv_reader: |
|
|
try: |
|
|
|
|
|
|
|
|
timestamp_key = None |
|
|
for key in ["timestamp", "time", "date", "unix"]: |
|
|
if key in row: |
|
|
timestamp_key = key |
|
|
break |
|
|
|
|
|
if not timestamp_key: |
|
|
continue |
|
|
|
|
|
ohlcv_data.append({ |
|
|
"timestamp": int(float(row.get(timestamp_key, 0))), |
|
|
"open": float(row.get("open", 0)), |
|
|
"high": float(row.get("high", 0)), |
|
|
"low": float(row.get("low", 0)), |
|
|
"close": float(row.get("close", 0)), |
|
|
"volume": float(row.get("volume", 0)) |
|
|
}) |
|
|
except (ValueError, KeyError) as e: |
|
|
logger.warning(f"⚠️ Failed to parse row: {e}") |
|
|
continue |
|
|
|
|
|
if ohlcv_data: |
|
|
|
|
|
ohlcv_data.sort(key=lambda x: x["timestamp"], reverse=True) |
|
|
|
|
|
|
|
|
self._cache[cache_key] = (ohlcv_data, datetime.utcnow().timestamp()) |
|
|
|
|
|
return ohlcv_data[:limit] |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"⚠️ Failed to fetch {csv_filename}: {e}") |
|
|
continue |
|
|
|
|
|
raise Exception(f"No data found for {symbol} in WinkingFace datasets") |
|
|
|
|
|
async def get_available_symbols(self) -> Dict[str, List[str]]: |
|
|
""" |
|
|
Get list of available symbols from all datasets |
|
|
""" |
|
|
return { |
|
|
"linxy_cryptocoin": self.linxy_symbols, |
|
|
"winkingface": list(self.winkingface_datasets.keys()) |
|
|
} |
|
|
|
|
|
async def get_available_timeframes(self, symbol: str) -> List[str]: |
|
|
""" |
|
|
Get available timeframes for a specific symbol |
|
|
""" |
|
|
symbol = symbol.upper().replace("USDT", "").replace("USD", "") |
|
|
|
|
|
timeframes = [] |
|
|
|
|
|
|
|
|
if symbol in self.linxy_symbols: |
|
|
timeframes.extend(self.linxy_timeframes) |
|
|
|
|
|
|
|
|
if symbol in self.winkingface_datasets: |
|
|
timeframes.append("1h") |
|
|
|
|
|
return list(set(timeframes)) |
|
|
|
|
|
|
|
|
|
|
|
hf_dataset_aggregator = HFDatasetAggregator() |
|
|
|
|
|
__all__ = ["HFDatasetAggregator", "hf_dataset_aggregator"] |
|
|
|
|
|
|