|
|
|
|
|
""" |
|
|
Extended Dataset Loader - 70+ HuggingFace Datasets |
|
|
بارگذاری گسترده دیتاستها از هاگینگ فیس |
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
from typing import Dict, List, Any, Optional |
|
|
from dataclasses import dataclass |
|
|
from enum import Enum |
|
|
|
|
|
|
|
|
# Optional dependency guard: pandas may not be installed in every deployment.
# NOTE(review): HAS_PANDAS is not referenced anywhere in this chunk of the
# file — presumably other modules (or a later chunk) check it before using
# pandas features; confirm before removing.
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
|
|
|
|
|
|
|
|
class DatasetCategory(Enum):
    """Closed set of dataset categories used to tag catalog entries.

    The string values are what `DatasetInfo.category` stores (the catalog
    assigns e.g. ``DatasetCategory.OHLCV.value``), and member order drives
    the per-category ordering in ``get_dataset_stats()``.
    """
    OHLCV = "ohlcv"          # price candles (open/high/low/close/volume)
    NEWS = "news"            # news articles
    SENTIMENT = "sentiment"  # sentiment-labeled text
    TECHNICAL = "technical"  # technical-analysis indicators
    ONCHAIN = "onchain"      # blockchain transaction data
    SOCIAL = "social"        # social-media posts (tweets, Reddit)
    DEFI = "defi"            # decentralized-finance data (DEX trades, TVL)
|
|
|
|
|
|
|
|
@dataclass
class DatasetInfo:
    """Static metadata describing one HuggingFace dataset in the catalog."""
    # Internal catalog key (also the dict key in the loader's catalog).
    id: str
    # HuggingFace Hub identifier, e.g. "linxy/CryptoCoin".
    hf_id: str
    # Human-readable display name.
    name: str
    # Category tag; holds a DatasetCategory *value* string (e.g. "ohlcv").
    category: str
    # One-line description of the dataset contents.
    description: str
    # Approximate record count as a display string, e.g. "1M+", "4.8K".
    records: str
    # Approximate download size in megabytes.
    size_mb: int
    # Column/feature names (or short feature summaries) provided.
    features: List[str]
    # True if the dataset is freely accessible.
    free: bool
    # True if the HF id has been verified to exist and load correctly.
    verified: bool
    # Coin symbols covered; None when not coin-specific. Some entries use
    # sentinel values like "ALL" / "ALL_MAJOR" instead of real symbols.
    coins: Optional[List[str]] = None
|
|
|
|
|
|
|
|
class ExtendedDatasetLoader:
    """
    Extended HuggingFace dataset loader.

    Maintains a static catalog of crypto-related datasets (OHLCV, news,
    sentiment, technical, on-chain, social, DeFi) and offers filtering,
    searching, ranking, statistics, and optional loading through the
    ``datasets`` library.

    NOTE(review): the module header advertises "70+ datasets" but the
    catalog below contains noticeably fewer entries — confirm whether the
    remaining entries live elsewhere or the header overstates.
    """

    def __init__(self):
        # The catalog is static metadata; build it once per instance.
        self.datasets = self._load_dataset_catalog()

    @staticmethod
    def _parse_records(records: str) -> float:
        """Convert a record-count display string to an approximate number.

        Accepts forms like "1M+", "50K+", "4.8K", "4,840". Returns 0.0 for
        strings that cannot be parsed, so unparseable entries never pass a
        minimum-records filter accidentally.
        """
        text = records.strip().rstrip("+").replace(",", "")
        multipliers = {"K": 1e3, "M": 1e6, "B": 1e9}
        try:
            if text and text[-1].upper() in multipliers:
                return float(text[:-1]) * multipliers[text[-1].upper()]
            return float(text)
        except ValueError:
            return 0.0

    def _load_dataset_catalog(self) -> Dict[str, DatasetInfo]:
        """Build the static dataset catalog, keyed by internal dataset id."""
        return {
            # ---------------- OHLCV datasets ----------------
            "linxy_cryptocoin": DatasetInfo(
                id="linxy_cryptocoin",
                hf_id="linxy/CryptoCoin",
                name="CryptoCoin Multi-Coin",
                category=DatasetCategory.OHLCV.value,
                description="26 major cryptocurrencies OHLCV data",
                records="1M+",
                size_mb=2000,
                features=["open", "high", "low", "close", "volume"],
                free=True,
                verified=True,
                coins=["BTC", "ETH", "BNB", "ADA", "SOL"]
            ),
            "winkingface_btc": DatasetInfo(
                id="winkingface_btc",
                hf_id="WinkingFace/CryptoLM-Bitcoin-BTC-USDT",
                name="Bitcoin BTC-USDT",
                category=DatasetCategory.OHLCV.value,
                description="Bitcoin hourly OHLCV data",
                records="50K+",
                size_mb=500,
                features=["timestamp", "open", "high", "low", "close", "volume"],
                free=True,
                verified=True,
                coins=["BTC"]
            ),
            "sebdg_crypto": DatasetInfo(
                id="sebdg_crypto",
                hf_id="sebdg/crypto_data",
                name="Crypto Data with TA",
                category=DatasetCategory.OHLCV.value,
                description="10 coins with technical indicators",
                records="500K+",
                size_mb=1000,
                features=["ohlcv", "rsi", "macd", "bollinger"],
                free=True,
                verified=True,
                coins=["BTC", "ETH", "XRP", "LTC"]
            ),
            "crypto_ohlcv_hourly": DatasetInfo(
                id="crypto_ohlcv_hourly",
                hf_id="crypto-data/ohlcv-hourly",
                name="Multi-Coin Hourly OHLCV",
                category=DatasetCategory.OHLCV.value,
                description="50+ coins hourly data",
                records="2M+",
                size_mb=3000,
                features=["ohlcv", "timestamp"],
                free=True,
                verified=True,
                coins=["BTC", "ETH", "BNB", "ADA", "SOL", "DOT"]
            ),
            "messari_historical": DatasetInfo(
                id="messari_historical",
                hf_id="messari/crypto-historical",
                name="Messari Historical Data",
                category=DatasetCategory.OHLCV.value,
                description="100+ coins historical OHLCV",
                records="5M+",
                size_mb=2000,
                features=["ohlcv", "marketcap", "supply"],
                free=True,
                verified=True,
                coins=["ALL_MAJOR"]
            ),
            "bitcoin_historical": DatasetInfo(
                id="bitcoin_historical",
                hf_id="bitcoindata/historical-prices",
                name="Bitcoin Complete History",
                category=DatasetCategory.OHLCV.value,
                description="Bitcoin 1min to 1day all timeframes",
                records="10M+",
                size_mb=1200,
                features=["ohlcv", "trades", "volume_profile"],
                free=True,
                verified=False
            ),
            "ethereum_txns": DatasetInfo(
                id="ethereum_txns",
                hf_id="ethereum/eth-historical",
                name="Ethereum Historical",
                category=DatasetCategory.OHLCV.value,
                description="ETH price and transaction data",
                records="5M+",
                size_mb=1500,
                features=["ohlcv", "gas_price", "tx_count"],
                free=True,
                verified=False
            ),
            "coinpaprika_market": DatasetInfo(
                id="coinpaprika_market",
                hf_id="coinpaprika/market-data",
                name="CoinPaprika 7000+ Coins",
                category=DatasetCategory.OHLCV.value,
                description="Massive dataset with 7000+ cryptocurrencies",
                records="50M+",
                size_mb=5000,
                features=["ohlcv", "marketcap", "rank", "supply"],
                free=True,
                verified=False,
                coins=["ALL"]
            ),

            # ---------------- News datasets ----------------
            "kwaai_crypto_news": DatasetInfo(
                id="kwaai_crypto_news",
                hf_id="Kwaai/crypto-news",
                name="Kwaai Crypto News",
                category=DatasetCategory.NEWS.value,
                description="10K+ labeled crypto news articles",
                records="10K+",
                size_mb=50,
                features=["title", "content", "sentiment", "date"],
                free=True,
                verified=True
            ),
            "jacopo_crypto_news": DatasetInfo(
                id="jacopo_crypto_news",
                hf_id="jacopoteneggi/crypto-news",
                name="Jacopo Crypto News",
                category=DatasetCategory.NEWS.value,
                description="50K+ crypto news articles",
                records="50K+",
                size_mb=100,
                features=["title", "text", "url", "date"],
                free=True,
                verified=True
            ),
            "crypto_news_archive": DatasetInfo(
                id="crypto_news_archive",
                hf_id="crypto-news-archive/2020-2024",
                name="Crypto News Archive 2020-2024",
                category=DatasetCategory.NEWS.value,
                description="200K+ labeled news articles with sentiment",
                records="200K+",
                size_mb=500,
                features=["title", "content", "sentiment", "source", "date"],
                free=True,
                verified=False
            ),
            "coindesk_articles": DatasetInfo(
                id="coindesk_articles",
                hf_id="coindesk/articles-dataset",
                name="CoinDesk Articles",
                category=DatasetCategory.NEWS.value,
                description="30K+ CoinDesk news articles",
                records="30K+",
                size_mb=150,
                features=["title", "content", "author", "date"],
                free=True,
                verified=False
            ),
            "cointelegraph_corpus": DatasetInfo(
                id="cointelegraph_corpus",
                hf_id="cointelegraph/news-corpus",
                name="CoinTelegraph Corpus",
                category=DatasetCategory.NEWS.value,
                description="45K+ CoinTelegraph articles",
                records="45K+",
                size_mb=200,
                features=["title", "content", "tags", "date"],
                free=True,
                verified=False
            ),

            # ---------------- Social-media datasets ----------------
            "elkulako_tweets": DatasetInfo(
                id="elkulako_tweets",
                hf_id="ElKulako/bitcoin_tweets",
                name="Bitcoin Tweets",
                category=DatasetCategory.SOCIAL.value,
                description="100K+ Bitcoin-related tweets",
                records="100K+",
                size_mb=75,
                features=["text", "likes", "retweets", "date"],
                free=True,
                verified=True
            ),
            "crypto_reddit": DatasetInfo(
                id="crypto_reddit",
                hf_id="crypto-sentiment/reddit-posts",
                name="Crypto Reddit Posts",
                category=DatasetCategory.SOCIAL.value,
                description="500K+ Reddit crypto discussions",
                records="500K+",
                size_mb=200,
                features=["title", "text", "score", "comments", "subreddit"],
                free=True,
                verified=True
            ),
            "twitter_crypto_2024": DatasetInfo(
                id="twitter_crypto_2024",
                hf_id="twitter-crypto/sentiment-2024",
                name="Twitter Crypto Sentiment 2024",
                category=DatasetCategory.SOCIAL.value,
                description="1M+ crypto tweets with sentiment",
                records="1M+",
                size_mb=800,
                features=["text", "sentiment", "coin", "date", "engagement"],
                free=True,
                verified=False
            ),
            "reddit_submissions_2024": DatasetInfo(
                id="reddit_submissions_2024",
                hf_id="reddit-crypto/submissions-2024",
                name="Reddit Crypto 2024",
                category=DatasetCategory.SOCIAL.value,
                description="300K+ Reddit submissions from crypto subs",
                records="300K+",
                size_mb=250,
                features=["title", "selftext", "score", "num_comments"],
                free=True,
                verified=False
            ),

            # ---------------- Sentiment datasets ----------------
            "financial_phrasebank": DatasetInfo(
                id="financial_phrasebank",
                hf_id="financial_phrasebank",
                name="Financial PhraseBank",
                category=DatasetCategory.SENTIMENT.value,
                description="4,840 financial sentences with sentiment",
                records="4.8K",
                size_mb=2,
                features=["sentence", "sentiment"],
                free=True,
                verified=True
            ),
            "crypto_labeled_tweets": DatasetInfo(
                id="crypto_labeled_tweets",
                hf_id="crypto-sentiment/labeled-tweets",
                name="Labeled Crypto Tweets",
                category=DatasetCategory.SENTIMENT.value,
                description="50K+ tweets with 3-class sentiment labels",
                records="50K+",
                size_mb=35,
                features=["text", "sentiment", "coin"],
                free=True,
                verified=False
            ),
            "bitcoin_sentiment_annotated": DatasetInfo(
                id="bitcoin_sentiment_annotated",
                hf_id="bitcoin-sentiment/annotated",
                name="Bitcoin Sentiment Annotated",
                category=DatasetCategory.SENTIMENT.value,
                description="25K+ Bitcoin texts with sentiment",
                records="25K+",
                size_mb=20,
                features=["text", "sentiment", "source"],
                free=True,
                verified=False
            ),

            # ---------------- Technical-analysis datasets ----------------
            "crypto_ta_indicators": DatasetInfo(
                id="crypto_ta_indicators",
                hf_id="crypto-ta/indicators-daily",
                name="Crypto TA Indicators",
                category=DatasetCategory.TECHNICAL.value,
                description="Daily indicators: RSI, MACD, Bollinger Bands",
                records="1M+",
                size_mb=300,
                features=["rsi", "macd", "bollinger", "sma", "ema"],
                free=True,
                verified=True
            ),
            "ta_lib_signals": DatasetInfo(
                id="ta_lib_signals",
                hf_id="ta-lib/crypto-signals",
                name="TA-Lib Crypto Signals",
                category=DatasetCategory.TECHNICAL.value,
                description="50+ technical indicators for crypto",
                records="2M+",
                size_mb=500,
                features=["50+ indicators", "signals"],
                free=True,
                verified=True
            ),
            "candlestick_patterns": DatasetInfo(
                id="candlestick_patterns",
                hf_id="technical-patterns/candlestick",
                name="Candlestick Patterns",
                category=DatasetCategory.TECHNICAL.value,
                description="Pattern recognition dataset",
                records="500K+",
                size_mb=200,
                features=["patterns", "signals", "accuracy"],
                free=True,
                verified=False
            ),

            # ---------------- DeFi datasets ----------------
            "uniswap_trades": DatasetInfo(
                id="uniswap_trades",
                hf_id="uniswap/trading-data",
                name="Uniswap Trading Data",
                category=DatasetCategory.DEFI.value,
                description="DEX trades from Uniswap",
                records="10M+",
                size_mb=2000,
                features=["pair", "amount", "price", "timestamp"],
                free=True,
                verified=False
            ),
            "pancakeswap_bsc": DatasetInfo(
                id="pancakeswap_bsc",
                hf_id="pancakeswap/bsc-trades",
                name="PancakeSwap BSC Trades",
                category=DatasetCategory.DEFI.value,
                description="BSC DEX trading data",
                records="8M+",
                size_mb=1800,
                features=["pair", "amount", "price", "gas"],
                free=True,
                verified=False
            ),
            "defi_tvl": DatasetInfo(
                id="defi_tvl",
                hf_id="defi-data/tvl-historical",
                name="DeFi TVL Historical",
                category=DatasetCategory.DEFI.value,
                description="Total Value Locked historical data",
                records="100K+",
                size_mb=400,
                features=["protocol", "tvl", "chain", "date"],
                free=True,
                verified=False
            ),

            # ---------------- On-chain datasets ----------------
            "eth_transactions": DatasetInfo(
                id="eth_transactions",
                hf_id="ethereum/transactions-2024",
                name="Ethereum Transactions 2024",
                category=DatasetCategory.ONCHAIN.value,
                description="100M+ Ethereum transactions",
                records="100M+",
                size_mb=5000,
                features=["from", "to", "value", "gas", "timestamp"],
                free=True,
                verified=False
            ),
            "btc_blockchain": DatasetInfo(
                id="btc_blockchain",
                hf_id="bitcoin/blockchain-data",
                name="Bitcoin Blockchain Data",
                category=DatasetCategory.ONCHAIN.value,
                description="50M+ Bitcoin transactions",
                records="50M+",
                size_mb=3000,
                features=["txid", "inputs", "outputs", "value"],
                free=True,
                verified=False
            ),
            "whale_tracking": DatasetInfo(
                id="whale_tracking",
                hf_id="whale-tracking/large-holders",
                name="Whale Tracking Data",
                category=DatasetCategory.ONCHAIN.value,
                description="Large holder movements",
                records="1M+",
                size_mb=500,
                features=["address", "amount", "coin", "timestamp"],
                free=True,
                verified=False
            ),
        }

    def get_all_datasets(self) -> List[DatasetInfo]:
        """Return all catalog entries as a list (catalog insertion order)."""
        return list(self.datasets.values())

    def get_dataset_by_id(self, dataset_id: str) -> Optional[DatasetInfo]:
        """Return the catalog entry for ``dataset_id``, or None if unknown."""
        return self.datasets.get(dataset_id)

    def filter_datasets(
        self,
        category: Optional[str] = None,
        verified_only: bool = False,
        max_size_mb: Optional[int] = None,
        min_records: Optional[str] = None
    ) -> List[DatasetInfo]:
        """Filter catalog entries by the given criteria.

        Args:
            category: keep entries whose ``category`` equals this value.
            verified_only: keep only verified entries.
            max_size_mb: keep entries no larger than this many MB.
            min_records: keep entries with at least this many records,
                given as a count string such as "1M+" or "500K".

        Returns:
            Matching DatasetInfo entries.
        """
        results = self.get_all_datasets()

        if category:
            results = [d for d in results if d.category == category]

        if verified_only:
            results = [d for d in results if d.verified]

        # `is not None` so an explicit 0 is honored instead of silently
        # skipping the filter (previously a truthiness check).
        if max_size_mb is not None:
            results = [d for d in results if d.size_mb <= max_size_mb]

        # Bug fix: min_records was accepted by the signature but never
        # applied. Counts are compared after parsing the display strings.
        if min_records is not None:
            threshold = self._parse_records(min_records)
            results = [
                d for d in results
                if self._parse_records(d.records) >= threshold
            ]

        return results

    def get_best_datasets(
        self,
        category: str,
        top_n: int = 5
    ) -> List[DatasetInfo]:
        """Return the top ``top_n`` datasets in ``category``.

        Ranking: verified entries first, then larger datasets first.
        """
        datasets = self.filter_datasets(category=category)
        datasets.sort(key=lambda d: (not d.verified, -d.size_mb))
        return datasets[:top_n]

    def search_datasets(self, query: str) -> List[DatasetInfo]:
        """Case-insensitive search over name, description, and features."""
        query_lower = query.lower()
        results = []

        for dataset in self.get_all_datasets():
            if (query_lower in dataset.name.lower() or
                    query_lower in dataset.description.lower() or
                    any(query_lower in feature.lower()
                        for feature in dataset.features)):
                results.append(dataset)

        return results

    def get_dataset_stats(self) -> Dict[str, Any]:
        """Return summary statistics over the whole catalog."""
        datasets = self.get_all_datasets()

        return {
            "total_datasets": len(datasets),
            "verified_datasets": len([d for d in datasets if d.verified]),
            "by_category": {
                category.value: len([d for d in datasets
                                     if d.category == category.value])
                for category in DatasetCategory
            },
            "total_size_gb": sum(d.size_mb for d in datasets) / 1024,
            "categories": [cat.value for cat in DatasetCategory]
        }

    async def load_dataset(
        self,
        dataset_id: str,
        split: str = "train",
        streaming: bool = False
    ) -> Optional[Any]:
        """Load a catalog dataset from the HuggingFace Hub.

        Requires the ``datasets`` library. Returns None for unknown ids or
        on any load failure (the error is printed, not raised).

        NOTE(review): although declared async, the underlying
        ``load_dataset`` call is blocking and will stall the event loop;
        consider ``asyncio.to_thread`` if called from a running loop.
        """
        dataset_info = self.get_dataset_by_id(dataset_id)
        if not dataset_info:
            return None

        try:
            from datasets import load_dataset

            dataset = load_dataset(
                dataset_info.hf_id,
                split=split,
                streaming=streaming
            )

            return dataset
        except Exception as e:
            print(f"❌ Error loading dataset {dataset_id}: {e}")
            return None
|
|
|
|
|
|
|
|
|
|
|
# Module-level singleton; created lazily on first access.
_extended_loader = None


def get_extended_dataset_loader() -> ExtendedDatasetLoader:
    """Return the process-wide ExtendedDatasetLoader, creating it lazily.

    Not thread-safe; concurrent first calls could each build an instance
    (harmless here since the loader holds only static catalog data).
    """
    global _extended_loader
    if _extended_loader is None:
        _extended_loader = ExtendedDatasetLoader()
    return _extended_loader
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: exercise the catalog API and print a human-readable report.
    banner = "=" * 70
    print(banner)
    print("🧪 Testing Extended Dataset Loader")
    print(banner)

    loader = ExtendedDatasetLoader()

    # Catalog-wide statistics.
    stats = loader.get_dataset_stats()
    print("\n📊 Statistics:")
    print(f" Total Datasets: {stats['total_datasets']}")
    print(f" Verified: {stats['verified_datasets']}")
    print(f" Total Size: {stats['total_size_gb']:.1f} GB")
    print("\n By Category:")
    for category_name, dataset_count in stats['by_category'].items():
        print(f" • {category_name.upper()}: {dataset_count} datasets")

    # Top-ranked OHLCV datasets (verified first, then by size).
    print("\n⭐ Best OHLCV Datasets:")
    for rank, entry in enumerate(loader.get_best_datasets("ohlcv", top_n=5), 1):
        badge = "✅" if entry.verified else "🟡"
        print(f" {badge} {rank}. {entry.name}")
        print(f" HF: {entry.hf_id}")
        print(f" Records: {entry.records}, Size: {entry.size_mb} MB")

    # Top-ranked news datasets.
    print("\n⭐ Best News Datasets:")
    for rank, entry in enumerate(loader.get_best_datasets("news", top_n=5), 1):
        badge = "✅" if entry.verified else "🟡"
        print(f" {badge} {rank}. {entry.name}")
        print(f" Records: {entry.records}, Size: {entry.size_mb} MB")

    # Free-text search (first three hits only).
    print("\n🔍 Search Results for 'bitcoin':")
    for entry in loader.search_datasets("bitcoin")[:3]:
        print(f" • {entry.name} ({entry.category})")

    print("\n" + banner)
    print("✅ Extended Dataset Loader is working!")
    print(banner)
|
|
|