|
|
|
|
|
""" |
|
|
Hugging Face Dataset Loader Service |
|
|
Access to free Hugging Face datasets.
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
from typing import Dict, List, Optional, Any, Union |
|
|
import logging |
|
|
import asyncio |
|
|
from datetime import datetime, timedelta |
|
|
|
|
|
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# The Hugging Face `datasets` package is optional. Record whether it could be
# imported so callers can degrade gracefully instead of failing at import time.
try:
    from datasets import load_dataset
except ImportError:
    DATASETS_AVAILABLE = False
    logger.warning("datasets library not available. Install with: pip install datasets")
else:
    DATASETS_AVAILABLE = True
|
|
|
|
|
|
|
|
class HFDatasetService:
    """
    Service for loading and using free Hugging Face datasets.

    Benefits:
    - Free access to 100,000+ datasets
    - Historical crypto data
    - News and sentiment data
    - No API key required (for public datasets)

    Loaded OHLCV frames are memoised per instance in ``self.cache`` as
    ``cache_key -> (DataFrame, load time)`` and reused for ``self.cache_ttl``
    seconds.
    """

    # Catalogue of known OHLCV datasets. The values are descriptive metadata
    # returned to callers as-is; nothing here is validated against the Hub.
    CRYPTO_DATASETS = {
        "linxy/CryptoCoin": {
            "description": "182 فایل CSV با OHLCV برای 26 کریپتو",
            "symbols": ["BTC", "ETH", "BNB", "SOL", "ADA", "XRP", "DOT", "DOGE",
                        "AVAX", "MATIC", "LINK", "UNI", "ATOM", "LTC", "XMR"],
            "timeframes": ["1m", "5m", "15m", "30m", "1h", "4h", "1d"],
            "columns": ["timestamp", "open", "high", "low", "close", "volume"],
            "date_range": "2017-present"
        },
        "WinkingFace/CryptoLM-Bitcoin-BTC-USDT": {
            "description": "داده تاریخی Bitcoin با indicators",
            "symbols": ["BTC"],
            "timeframes": ["1h"],
            "columns": ["timestamp", "open", "high", "low", "close", "volume", "rsi", "macd"],
            "date_range": "2019-2023"
        },
        "sebdg/crypto_data": {
            "description": "OHLCV + indicators برای 10 کریپتو",
            "symbols": ["BTC", "ETH", "BNB", "ADA", "DOT", "LINK", "UNI", "AVAX", "MATIC", "SOL"],
            "indicators": ["RSI", "MACD", "Bollinger Bands", "EMA", "SMA"],
            "timeframes": ["1h", "4h", "1d"],
            "date_range": "2020-present"
        }
    }

    # Catalogue of known news datasets (descriptive metadata only).
    NEWS_DATASETS = {
        "Kwaai/crypto-news": {
            "description": "اخبار کریپتو با sentiment labels",
            "size": "10,000+ news articles",
            "languages": ["en"],
            "date_range": "2020-2023"
        },
        "jacopoteneggi/crypto-news": {
            "description": "اخبار روزانه کریپتو",
            "size": "50,000+ articles",
            "sources": ["CoinDesk", "CoinTelegraph", "Bitcoin Magazine"],
            "date_range": "2018-2023"
        }
    }

    # Number of candles per day for each timeframe; used to turn a number of
    # days into a row limit. Unknown timeframes fall back to 24 (hourly).
    _RECORDS_PER_DAY = {
        "1m": 1440,
        "5m": 288,
        "15m": 96,
        "30m": 48,
        "1h": 24,
        "4h": 6,
        "1d": 1
    }

    def __init__(self):
        # cache_key -> (DataFrame, datetime the entry was stored)
        self.cache = {}
        # Lifetime of a cache entry, in seconds.
        self.cache_ttl = 3600

    def is_available(self) -> bool:
        """Return whether the optional ``datasets`` library could be imported."""
        return DATASETS_AVAILABLE

    # ------------------------------------------------------------------
    # Pure helpers (no I/O, no module globals)
    # ------------------------------------------------------------------

    @staticmethod
    def _select_records(rows, symbol, limit):
        """Collect at most ``limit`` rows from ``rows`` that match ``symbol``.

        Rows without a ``"symbol"`` key are accepted unconditionally; rows with
        one are compared case-insensitively. A non-positive ``limit`` yields an
        empty list without consuming ``rows``.
        """
        selected = []
        if limit <= 0:
            return selected

        wanted = symbol.upper()
        for row in rows:
            # str() keeps a null/non-string symbol from aborting the whole load.
            if "symbol" in row and str(row["symbol"]).upper() != wanted:
                continue
            selected.append(row)
            if len(selected) >= limit:
                break
        return selected

    @staticmethod
    def _normalize_ohlcv_frame(records) -> pd.DataFrame:
        """Build a DataFrame from ``records``, newest row first.

        When a ``timestamp`` column exists, textual/object values are parsed
        with ``pd.to_datetime`` and the frame is sorted by it in descending
        order. Frames without that column keep the input order.
        """
        df = pd.DataFrame(records)
        if df.empty or "timestamp" not in df.columns:
            return df

        stamps = df["timestamp"]
        # Also cover pandas' dedicated string dtype, not only ``object``.
        if pd.api.types.is_object_dtype(stamps) or pd.api.types.is_string_dtype(stamps):
            df["timestamp"] = pd.to_datetime(stamps)
        return df.sort_values("timestamp", ascending=False)

    @staticmethod
    def _build_news_item(record) -> Dict[str, Any]:
        """Map one raw dataset record onto the news-item shape returned to callers."""
        return {
            "title": record.get("title", ""),
            "content": record.get("text", record.get("content", "")),
            "url": record.get("url", ""),
            "source": record.get("source", "HuggingFace Dataset"),
            "published_at": record.get("date", record.get("published_at", "")),
            "sentiment": record.get("sentiment", "neutral")
        }

    @staticmethod
    def _summarize_prices(df) -> Dict[str, Any]:
        """Compute headline statistics for a non-empty OHLCV frame.

        The first row is treated as the latest candle and the last row as the
        earliest one. Any statistic whose column is missing is reported as 0.
        """
        has_close = "close" in df.columns
        latest_close = float(df.iloc[0]["close"]) if has_close else 0
        earliest_close = float(df.iloc[-1]["close"]) if has_close else 0

        price_change = latest_close - earliest_close
        # Guard against division by zero (and non-positive/NaN baselines).
        price_change_pct = (price_change / earliest_close * 100) if earliest_close > 0 else 0

        return {
            "latest_price": latest_close,
            "price_change": price_change,
            "price_change_pct": price_change_pct,
            "high": float(df["high"].max()) if "high" in df.columns else 0,
            "low": float(df["low"].min()) if "low" in df.columns else 0,
            "avg_volume": float(df["volume"].mean()) if "volume" in df.columns else 0
        }

    # ------------------------------------------------------------------
    # Blocking workers (run in an executor thread by the async API)
    # ------------------------------------------------------------------

    def _fetch_ohlcv_records(self, dataset_name, symbol, limit):
        """Stream ``dataset_name`` and return up to ``limit`` rows for ``symbol``."""
        dataset = load_dataset(
            dataset_name,
            split="train",
            streaming=True
        )
        return self._select_records(dataset, symbol, limit)

    def _fetch_news_items(self, dataset_name, limit):
        """Stream ``dataset_name`` and return up to ``limit`` normalised news items."""
        news_items = []
        if limit <= 0:
            return news_items

        dataset = load_dataset(
            dataset_name,
            split="train",
            streaming=True
        )
        for record in dataset:
            news_items.append(self._build_news_item(record))
            if len(news_items) >= limit:
                break
        return news_items

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def load_crypto_ohlcv(
        self,
        symbol: str = "BTC",
        timeframe: str = "1h",
        limit: int = 1000,
        dataset_name: str = "linxy/CryptoCoin"
    ) -> pd.DataFrame:
        """
        Load OHLCV rows from a dataset.

        Args:
            symbol: Crypto symbol (BTC, ETH, ...), matched case-insensitively.
            timeframe: Candle interval (1m, 5m, 1h, 1d, ...).
            limit: Maximum number of rows; a non-positive value yields an
                empty frame.
            dataset_name: Dataset name on the Hugging Face Hub.

        Returns:
            DataFrame with the OHLCV rows (newest first when a ``timestamp``
            column exists). An empty DataFrame is returned when the library is
            missing, nothing matched, or loading failed; errors are logged,
            never raised.
        """
        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available")
            return pd.DataFrame()

        try:
            # NOTE(review): `timeframe` only takes part in the cache key; rows
            # are not filtered by it. Confirm whether the dataset needs a
            # per-timeframe config / data_files selection.
            cache_key = f"{dataset_name}:{symbol}:{timeframe}:{limit}"

            if cache_key in self.cache:
                cached_data, cached_time = self.cache[cache_key]
                if (datetime.now() - cached_time).total_seconds() < self.cache_ttl:
                    logger.info(f"Returning cached data for {cache_key}")
                    return cached_data
                # Expired: drop it so a failed reload does not pin stale data.
                del self.cache[cache_key]

            logger.info(f"Loading dataset {dataset_name} for {symbol}...")

            # load_dataset and the streaming iteration are blocking network
            # I/O; run them in a worker thread so the event loop stays free.
            loop = asyncio.get_running_loop()
            records = await loop.run_in_executor(
                None, self._fetch_ohlcv_records, dataset_name, symbol, limit
            )

            df = self._normalize_ohlcv_frame(records)

            # Only cache real results so an empty answer is retried next time.
            if not df.empty:
                self.cache[cache_key] = (df, datetime.now())

            logger.info(f"Loaded {len(df)} records for {symbol}")
            return df

        except Exception as e:
            # Best-effort API: report (with traceback) and return an empty frame.
            logger.exception(f"Error loading dataset: {e}")
            return pd.DataFrame()

    async def load_crypto_news(
        self,
        limit: int = 100,
        dataset_name: str = "Kwaai/crypto-news"
    ) -> List[Dict[str, Any]]:
        """
        Load crypto news items from a dataset.

        Args:
            limit: Maximum number of news items; a non-positive value yields
                an empty list.
            dataset_name: Dataset name on the Hugging Face Hub.

        Returns:
            List of dicts with the keys ``title``, ``content``, ``url``,
            ``source``, ``published_at`` and ``sentiment``. An empty list is
            returned when the library is missing or loading failed; errors are
            logged, never raised.
        """
        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available")
            return []

        try:
            logger.info(f"Loading news from {dataset_name}...")

            # Blocking network I/O: keep it off the event loop.
            loop = asyncio.get_running_loop()
            news_items = await loop.run_in_executor(
                None, self._fetch_news_items, dataset_name, limit
            )

            logger.info(f"Loaded {len(news_items)} news articles")
            return news_items

        except Exception as e:
            logger.exception(f"Error loading news: {e}")
            return []

    async def get_historical_prices(
        self,
        symbol: str,
        days: int = 30,
        timeframe: str = "1h"
    ) -> Dict[str, Any]:
        """
        Get historical prices together with summary statistics.

        Args:
            symbol: Crypto symbol.
            days: Number of past days to cover.
            timeframe: Candle interval; unknown values are treated as hourly
                when computing how many rows to request.

        Returns:
            On success, a dict with ``status == "success"``, the request
            parameters, the computed statistics and up to 100 rows under
            ``data``. When no rows are available, a dict with
            ``status == "error"``.
        """
        limit = self._RECORDS_PER_DAY.get(timeframe, 24) * days

        df = await self.load_crypto_ohlcv(symbol, timeframe, limit)

        if df.empty:
            return {
                "status": "error",
                "error": "No data available",
                "symbol": symbol
            }

        stats = self._summarize_prices(df)

        return {
            "status": "success",
            "symbol": symbol,
            "timeframe": timeframe,
            "days": days,
            "records": len(df),
            "latest_price": stats["latest_price"],
            "price_change": stats["price_change"],
            "price_change_pct": stats["price_change_pct"],
            "high": stats["high"],
            "low": stats["low"],
            "avg_volume": stats["avg_volume"],
            # Convert only the rows that are actually returned.
            "data": df.head(100).to_dict(orient="records"),
            "source": "HuggingFace Dataset",
            "is_free": True
        }

    def get_available_datasets(self) -> Dict[str, Any]:
        """
        Return the catalogue of known datasets plus the library status.
        """
        return {
            "crypto_data": {
                "total": len(self.CRYPTO_DATASETS),
                "datasets": self.CRYPTO_DATASETS
            },
            "news_data": {
                "total": len(self.NEWS_DATASETS),
                "datasets": self.NEWS_DATASETS
            },
            "library_available": DATASETS_AVAILABLE,
            "installation": "pip install datasets" if not DATASETS_AVAILABLE else "✅ Installed"
        }

    def get_supported_symbols(self) -> List[str]:
        """
        Return the sorted union of symbols listed across all crypto datasets.
        """
        symbols = set()
        for dataset_info in self.CRYPTO_DATASETS.values():
            symbols.update(dataset_info.get("symbols", []))
        return sorted(symbols)

    def get_supported_timeframes(self) -> List[str]:
        """
        Return the sorted union of timeframes listed across all crypto datasets.
        """
        timeframes = set()
        for dataset_info in self.CRYPTO_DATASETS.values():
            timeframes.update(dataset_info.get("timeframes", []))
        return sorted(timeframes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def quick_price_data(
    symbol: str = "BTC",
    days: int = 7
) -> Dict[str, Any]:
    """
    Convenience shortcut for fetching price data.

    Builds a fresh ``HFDatasetService`` and delegates to its
    ``get_historical_prices`` with that method's default timeframe.

    Args:
        symbol: Crypto symbol.
        days: Number of days to cover.

    Returns:
        Dict with the price data and statistics, exactly as produced by
        ``HFDatasetService.get_historical_prices``.
    """
    return await HFDatasetService().get_historical_prices(symbol, days)
|
|
|
|
|
|
|
|
async def quick_crypto_news(limit: int = 10) -> List[Dict[str, Any]]:
    """
    Convenience shortcut for fetching crypto news.

    Builds a fresh ``HFDatasetService`` and delegates to its
    ``load_crypto_news`` with that method's default dataset.

    Args:
        limit: Number of news items to fetch.

    Returns:
        List of news items, exactly as produced by
        ``HFDatasetService.load_crypto_news``.
    """
    return await HFDatasetService().load_crypto_news(limit)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import asyncio

    async def _show_price_summary(service):
        """Fetch a week of hourly BTC data and print the headline figures."""
        try:
            summary = await service.get_historical_prices("BTC", days=7, timeframe="1h")
            if summary["status"] != "success":
                print(f" ❌ Error: {summary.get('error')}")
                return
            print(f" ✅ Loaded {summary['records']} records")
            print(f" Latest price: ${summary['latest_price']:,.2f}")
            print(f" Change: {summary['price_change_pct']:+.2f}%")
            print(f" High: ${summary['high']:,.2f}")
            print(f" Low: ${summary['low']:,.2f}")
        except Exception as e:
            print(f" ❌ Exception: {e}")

    async def _show_news_sample(service):
        """Fetch five news items and print the first three titles."""
        try:
            articles = await service.load_crypto_news(limit=5)
            print(f" ✅ Loaded {len(articles)} news articles")
            for position, entry in enumerate(articles[:3], 1):
                print(f" {position}. {entry['title'][:60]}...")
        except Exception as e:
            print(f" ❌ Exception: {e}")

    async def test_service():
        """Manual smoke test of the service; prints progress to stdout."""
        print("🧪 Testing HF Dataset Service...")

        service = HFDatasetService()

        # Step 1: bail out early when the optional dependency is missing.
        print(f"\n1️⃣ Library available: {service.is_available()}")
        if not service.is_available():
            print(" ⚠️ Install with: pip install datasets")
            return

        # Step 2: catalogue sizes.
        print("\n2️⃣ Available Datasets:")
        catalogue = service.get_available_datasets()
        print(f" Crypto datasets: {catalogue['crypto_data']['total']}")
        print(f" News datasets: {catalogue['news_data']['total']}")

        # Step 3: first ten supported symbols.
        print("\n3️⃣ Supported Symbols:")
        supported = service.get_supported_symbols()
        print(f" {', '.join(supported[:10])}...")

        # Step 4: price history.
        print("\n4️⃣ Loading BTC price data...")
        await _show_price_summary(service)

        # Step 5: news.
        print("\n5️⃣ Loading crypto news...")
        await _show_news_sample(service)

        print("\n✅ Testing complete!")

    asyncio.run(test_service())
|
|
|