// Source: Hugging Face Space "davanstrien/ocr-time-capsule" (status: Running).
// File size: 11,408 bytes.

/**
 * HuggingFace Dataset Viewer API wrapper
 * Handles fetching data from the datasets-server API with caching and error handling
 */

class DatasetAPI {
    /**
     * Create a client with an empty in-memory cache.
     */
    constructor() {
        this.baseURL = 'https://datasets-server.huggingface.co';
        this.cache = new Map();
        this.cacheExpiry = 45 * 60 * 1000; // 45 minutes (conservative for signed URLs)
        this.rowsPerFetch = 100; // API maximum
    }

    /**
     * Build a request URL for an endpoint of the datasets server.
     *
     * @param {string} endpoint - Endpoint name without a leading slash (e.g. 'rows').
     * @param {Object} params - Query parameters; values are URI-encoded in insertion order.
     * @returns {string} The absolute URL.
     */
    buildUrl(endpoint, params) {
        const query = Object.entries(params)
            .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
            .join('&');
        return `${this.baseURL}/${endpoint}?${query}`;
    }

    /**
     * Build the cache key under which a batch of rows is stored.
     * Shared by fetchRows and refreshImageUrl so both always agree on the key.
     *
     * @returns {string} The cache key.
     */
    rowsCacheKey(datasetId, config, split, offset, length) {
        return `rows_${datasetId}_${config}_${split}_${offset}_${length}`;
    }

    /**
     * Describe a non-OK HTTP response for use in an error message.
     *
     * @param {{status: number, statusText: string}} response - The failed response.
     * @returns {string} The status text, or "HTTP <status>" when the text is empty.
     */
    describeHttpError(response) {
        return response.statusText || `HTTP ${response.status}`;
    }

    /**
     * Check if a dataset is valid and has viewer enabled.
     *
     * @param {string} datasetId - Dataset identifier.
     * @returns {Promise<boolean>} Resolves to true when the viewer is available.
     * @throws {Error} "Dataset validation failed: ..." when the request fails or
     *   the response does not report a viewer; the underlying error is kept as `cause`.
     */
    async validateDataset(datasetId) {
        try {
            const response = await fetch(this.buildUrl('is-valid', { dataset: datasetId }));
            if (!response.ok) {
                throw new Error(`Failed to validate dataset: ${this.describeHttpError(response)}`);
            }
            const data = await response.json();

            if (!data.viewer) {
                throw new Error('Dataset viewer is not available for this dataset');
            }

            return true;
        } catch (error) {
            throw new Error(`Dataset validation failed: ${error.message}`, { cause: error });
        }
    }

    /**
     * Get dataset info including splits and configs. Results are cached.
     *
     * @param {string} datasetId - Dataset identifier.
     * @returns {Promise<{configs: string[], splits: string[], defaultConfig: string,
     *   defaultSplit: string, raw: Object}>} Unique config/split names, the chosen
     *   defaults, and the raw response body.
     * @throws {Error} "Failed to get dataset info: ..." with the underlying error as `cause`.
     */
    async getDatasetInfo(datasetId) {
        const cacheKey = `info_${datasetId}`;
        const cached = this.getFromCache(cacheKey);
        if (cached) return cached;

        try {
            const response = await fetch(this.buildUrl('splits', { dataset: datasetId }));
            if (!response.ok) {
                // The prefix is added once, in the catch below.
                throw new Error(this.describeHttpError(response));
            }
            const data = await response.json();

            // Default config is the first one listed; default split prefers
            // 'train', then the first listed split.
            const defaultConfig = data.splits[0]?.config || 'default';
            const defaultSplit = data.splits.find(s => s.split === 'train')?.split || data.splits[0]?.split || 'train';

            const info = {
                configs: [...new Set(data.splits.map(s => s.config))],
                splits: [...new Set(data.splits.map(s => s.split))],
                defaultConfig,
                defaultSplit,
                raw: data
            };

            this.setCache(cacheKey, info);
            return info;
        } catch (error) {
            throw new Error(`Failed to get dataset info: ${error.message}`, { cause: error });
        }
    }

    /**
     * Pull the row count for one split out of a size-endpoint response body.
     *
     * The entry in `size.splits` that matches the requested split is preferred,
     * because `size.config.num_rows` covers every split of the config. When no
     * matching entry is present the config total, then the first listed split,
     * then 0 are used.
     *
     * @param {Object} sizeData - Parsed response body.
     * @param {string} config - Requested config name.
     * @param {string} split - Requested split name.
     * @returns {number} The row count (0 when nothing usable is present).
     */
    extractSplitRowCount(sizeData, config, split) {
        const size = sizeData?.size;
        const entries = Array.isArray(size?.splits) ? size.splits : [];
        const match = entries.find(
            (entry) => entry?.split === split && (entry.config === undefined || entry.config === config)
        );
        if (typeof match?.num_rows === 'number') {
            return match.num_rows;
        }
        return size?.config?.num_rows || entries[0]?.num_rows || 0;
    }

    /**
     * Get the total number of rows in a dataset split. Results are cached.
     *
     * @param {string} datasetId - Dataset identifier.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @returns {Promise<number|null>} The row count, or null when it could not be
     *   determined (the failure is logged with console.warn, not thrown).
     */
    async getTotalRows(datasetId, config, split) {
        const cacheKey = `size_${datasetId}_${config}_${split}`;
        const cached = this.getFromCache(cacheKey);
        // Compare against null so a cached count of 0 is still served.
        if (cached !== null) return cached;

        const params = { dataset: datasetId, config, split };

        try {
            // First try to get from the size endpoint
            const sizeResponse = await fetch(this.buildUrl('size', params));

            if (sizeResponse.ok) {
                const sizeData = await sizeResponse.json();
                const size = this.extractSplitRowCount(sizeData, config, split);
                this.setCache(cacheKey, size);
                return size;
            }

            // Fallback: get first rows and check num_rows_total
            const rowsResponse = await fetch(this.buildUrl('first-rows', params));

            if (!rowsResponse.ok) {
                throw new Error('Unable to determine dataset size');
            }

            const rowsData = await rowsResponse.json();
            const size = rowsData.num_rows_total || rowsData.rows?.length || 0;
            this.setCache(cacheKey, size);
            return size;
        } catch (error) {
            console.warn('Failed to get total rows:', error);
            return null;
        }
    }

    /**
     * Fetch a batch of rows from the dataset. Results are cached per
     * (dataset, config, split, offset, length).
     *
     * @param {string} datasetId - Dataset identifier.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} offset - Index of the first row to fetch.
     * @param {number} [length=this.rowsPerFetch] - Number of rows to fetch.
     * @returns {Promise<{rows: Array, features: Array, columns: Object,
     *   numRowsTotal: number, partial: boolean}>} The batch plus detected columns.
     * @throws {Error} "Failed to fetch rows: ..." with the underlying error as
     *   `cause`; HTTP 403 yields an access-denied message.
     */
    async fetchRows(datasetId, config, split, offset, length = this.rowsPerFetch) {
        const cacheKey = this.rowsCacheKey(datasetId, config, split, offset, length);
        const cached = this.getFromCache(cacheKey);
        if (cached) return cached;

        try {
            const response = await fetch(
                this.buildUrl('rows', { dataset: datasetId, config, split, offset, length })
            );

            if (!response.ok) {
                if (response.status === 403) {
                    throw new Error('Access denied. This dataset may be private or gated.');
                }
                // The prefix is added once, in the catch below.
                throw new Error(this.describeHttpError(response));
            }

            const data = await response.json();
            // Tolerate a payload without a rows array instead of failing with a TypeError.
            const rows = Array.isArray(data.rows) ? data.rows : [];

            // Extract column information
            const columns = this.detectColumns(data.features, rows[0]?.row);

            const result = {
                rows,
                features: data.features,
                columns,
                numRowsTotal: data.num_rows_total,
                partial: data.partial || false
            };

            this.setCache(cacheKey, result);
            return result;
        } catch (error) {
            throw new Error(`Failed to fetch rows: ${error.message}`, { cause: error });
        }
    }

    /**
     * Get a single row by index. The whole batch of `rowsPerFetch` rows that
     * contains the index is fetched (and cached), so neighbouring rows are
     * served without another request.
     *
     * @param {string} datasetId - Dataset identifier.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} index - Zero-based row index.
     * @returns {Promise<{row: Object, columns: Object, numRowsTotal: number}>}
     * @throws {Error} "Row <index> not found" when the index is not a
     *   non-negative integer or lies outside the fetched batch; fetchRows errors
     *   propagate unchanged.
     */
    async getRow(datasetId, config, split, index) {
        // Reject invalid indices up front rather than sending a negative/NaN offset.
        const rowIndex = Number(index);
        if (!Number.isInteger(rowIndex) || rowIndex < 0) {
            throw new Error(`Row ${index} not found`);
        }

        // Calculate which batch this index falls into
        const batchStart = Math.floor(rowIndex / this.rowsPerFetch) * this.rowsPerFetch;
        const batchData = await this.fetchRows(datasetId, config, split, batchStart, this.rowsPerFetch);

        const localIndex = rowIndex - batchStart;
        if (localIndex < batchData.rows.length) {
            return {
                row: batchData.rows[localIndex].row,
                columns: batchData.columns,
                numRowsTotal: batchData.numRowsTotal
            };
        }

        throw new Error(`Row ${index} not found`);
    }

    /**
     * Detect column names for image and text data.
     *
     * Feature metadata is consulted first; anything still undetected is looked
     * up in the keys/values of a sample row. Both passes use the same lists of
     * known text column names.
     *
     * @param {Array<{name: string, type: Object}>|null|undefined} features - Feature descriptors.
     * @param {Object|null|undefined} sampleRow - One row's data, used as a fallback.
     * @returns {{image: string|null, originalText: string|null,
     *   improvedText: string|null, inferenceInfo: string|null}} Detected column
     *   names; null for any role that was not found.
     */
    detectColumns(features, sampleRow) {
        const originalTextNames = ['text', 'ocr', 'original_text', 'original', 'ground_truth'];
        const improvedTextNames = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'];
        const inferenceInfoName = 'inference_info';

        let imageColumn = null;
        let originalTextColumn = null;
        let improvedTextColumn = null;
        let inferenceInfoColumn = null;

        // Try to detect from features first
        for (const feature of features || []) {
            if (!feature) continue;
            const name = feature.name;
            const type = feature.type;

            // Detect image column. There is no "first wins" guard here, so the
            // last image-typed feature is the one kept.
            if (type?._type === 'Image' || type?.dtype === 'image' || type?.feature?._type === 'Image') {
                imageColumn = name;
            }

            // Detect text columns based on common patterns (first match wins)
            if (!originalTextColumn && originalTextNames.includes(name)) {
                originalTextColumn = name;
            }

            if (!improvedTextColumn && improvedTextNames.includes(name)) {
                improvedTextColumn = name;
            }

            // Detect inference info column
            if (name === inferenceInfoName) {
                inferenceInfoColumn = name;
            }
        }

        // Fallback: detect from sample row
        if (sampleRow) {
            const keys = Object.keys(sampleRow);

            if (!imageColumn) {
                // A value carrying both `src` and `height` is treated as an image.
                imageColumn = keys.find(
                    (key) => sampleRow[key]?.src && sampleRow[key]?.height !== undefined
                ) ?? null;
            }

            if (!originalTextColumn) {
                originalTextColumn = keys.find(k => originalTextNames.includes(k)) || null;
            }

            if (!improvedTextColumn) {
                improvedTextColumn = keys.find(k => improvedTextNames.includes(k)) || null;
            }

            if (!inferenceInfoColumn && keys.includes(inferenceInfoName)) {
                inferenceInfoColumn = inferenceInfoName;
            }
        }

        return {
            image: imageColumn,
            originalText: originalTextColumn,
            improvedText: improvedTextColumn,
            inferenceInfo: inferenceInfoColumn
        };
    }

    /**
     * Refresh expired image URL by dropping the cached batch that contains the
     * row and fetching the row again.
     *
     * @param {string} datasetId - Dataset identifier.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} index - Zero-based row index.
     * @returns {Promise<{row: Object, columns: Object, numRowsTotal: number}>} Same shape as getRow.
     */
    async refreshImageUrl(datasetId, config, split, index) {
        // Clear cache for this specific row batch
        const batchStart = Math.floor(index / this.rowsPerFetch) * this.rowsPerFetch;
        this.cache.delete(this.rowsCacheKey(datasetId, config, split, batchStart, this.rowsPerFetch));

        // Re-fetch the row
        return await this.getRow(datasetId, config, split, index);
    }

    /**
     * Read a value from the cache.
     *
     * @param {string} key - Cache key.
     * @returns {*} The cached data, or null when the key is absent or the entry
     *   is older than `cacheExpiry` (expired entries are removed).
     */
    getFromCache(key) {
        const cached = this.cache.get(key);
        if (!cached) return null;

        if (Date.now() - cached.timestamp > this.cacheExpiry) {
            this.cache.delete(key);
            return null;
        }

        return cached.data;
    }

    /**
     * Store a value in the cache, stamped with the current time.
     *
     * @param {string} key - Cache key.
     * @param {*} data - Value to store.
     */
    setCache(key, data) {
        this.cache.set(key, {
            data,
            timestamp: Date.now()
        });
    }

    /**
     * Remove every cache entry.
     */
    clearCache() {
        this.cache.clear();
    }

    /**
     * Parse inference info safely.
     *
     * Accepts a plain object (returned as-is), an array (first item), or a JSON
     * string holding either. Empty arrays yield null in both the string and
     * non-string form.
     *
     * @param {*} inferenceInfoData - Raw column value.
     * @returns {*} The parsed info, or null when the input is empty, of an
     *   unsupported type, or not valid JSON (parse failures are logged with
     *   console.warn).
     */
    parseInferenceInfo(inferenceInfoData) {
        if (!inferenceInfoData) return null;

        try {
            // Handle if it's a JSON string
            if (typeof inferenceInfoData === 'string') {
                const parsed = JSON.parse(inferenceInfoData);
                // If it's an array, take the first item (null when empty)
                if (Array.isArray(parsed)) {
                    return parsed.length > 0 ? parsed[0] : null;
                }
                return parsed;
            }

            // Handle if it's already an array
            if (Array.isArray(inferenceInfoData)) {
                return inferenceInfoData.length > 0 ? inferenceInfoData[0] : null;
            }

            // Handle if it's already an object (some datasets might store it as object)
            if (typeof inferenceInfoData === 'object') {
                return inferenceInfoData;
            }

            return null;
        } catch (error) {
            console.warn('Failed to parse inference info:', error);
            return null;
        }
    }
}

// Export for use in other scripts: attach the class to the global `window`
// object under the name `DatasetAPI`.
// NOTE(review): this assumes a browser context where `window` is defined.
window.DatasetAPI = DatasetAPI;