/**
 * Main Alpine.js application for OCR Text Explorer.
 *
 * Registers the `ocrExplorer` component, which loads rows of an OCR dataset
 * through `DatasetAPI`, compares original and improved text (diff + stats),
 * optionally renders markdown / reasoning traces, and keeps the page URL in
 * sync with the current view so it can be shared.
 *
 * Relies on globals provided by other scripts on the page: `Alpine`,
 * `DatasetAPI`, `marked`, `ReasoningParser` (optional), and the diff helpers
 * `createCharacterDiff`, `createWordDiff`, `createLineDiff`,
 * `createMarkdownDiff`, `calculateSimilarity`, `computeLCS`, `buildDiff`.
 */
document.addEventListener('alpine:init', () => {
  // Upper bound on cached markdown renders (oldest entry is evicted first).
  const MARKDOWN_CACHE_LIMIT = 50;

  // Protocols that rendered links/images may point at. Anything else
  // (javascript:, data:, vbscript:, ...) is dropped.
  const SAFE_URL_PROTOCOLS = ['http:', 'https:', 'mailto:'];

  // Raw HTML tags that markdown content may contain (table markup only).
  const ALLOWED_HTML_TAGS = ['table', 'thead', 'tbody', 'tr', 'th', 'td'];

  // Tailwind classes added to bare tags emitted by marked.
  // NOTE(review): the original tag/class strings were lost when this file was
  // extracted (the HTML was stripped); these values are a reconstruction --
  // confirm against the original stylesheet before shipping.
  const MARKDOWN_ELEMENT_CLASSES = {
    h1: 'text-2xl font-bold mt-4 mb-2',
    h2: 'text-xl font-semibold mt-3 mb-2',
    h3: 'text-lg font-semibold mt-2 mb-1',
    p: 'mb-3',
    ul: 'list-disc pl-6 mb-3',
    ol: 'list-decimal pl-6 mb-3',
    blockquote: 'border-l-4 border-gray-300 pl-4 italic my-3',
    code: 'bg-gray-100 dark:bg-gray-800 rounded px-1',
    table: 'min-w-full border-collapse border border-gray-300 my-3',
    th: 'border border-gray-300 px-3 py-2 font-semibold text-left',
    td: 'border border-gray-300 px-3 py-2',
  };

  Alpine.data('ocrExplorer', () => ({
    // Dataset state
    datasetId: 'davanstrien/exams-ocr',
    datasetConfig: 'default',
    datasetSplit: 'train',

    // Example datasets
    exampleDatasets: [
      {
        id: 'davanstrien/exams-ocr',
        name: 'Exams OCR',
        description: 'Historical exam papers with VLM corrections',
      },
      {
        id: 'davanstrien/rolm-test',
        name: 'ROLM Test',
        description: 'Documents processed with RolmOCR model',
      },
      {
        id: 'davanstrien/india-medical-ocr-test',
        name: 'India Medical OCR',
        description: 'Medical documents with NuMarkdown reasoning traces',
      },
    ],

    // Navigation state
    currentIndex: 0,
    totalSamples: null,
    currentSample: null,
    jumpToPage: '',

    // UI state
    loading: false,
    error: null,
    activeTab: 'comparison',
    diffMode: 'char',
    darkMode: false,
    showAbout: false,
    showFlowView: false,
    showDock: false,
    renderMarkdown: false,
    hasMarkdown: false,
    showShareSuccess: false,

    // Reasoning trace state
    hasReasoningTrace: false,
    showReasoning: false,
    reasoningContent: null,
    answerContent: null,
    reasoningStats: null,
    formattedReasoning: null,

    // Flow view state
    flowItems: [],
    flowStartIndex: 0,
    flowVisibleCount: 7,
    flowOffset: 0,

    // Dock state
    dockItems: [],
    dockHideTimeout: null,
    dockStartIndex: 0,
    dockVisibleCount: 10,

    // Computed diff HTML
    diffHtml: '',

    // Statistics
    similarity: 0,
    charStats: { total: 0, added: 0, removed: 0 },
    wordStats: { original: 0, improved: 0 },

    // API instance
    api: null,

    // Markdown cache (source text -> rendered HTML)
    markdownCache: new Map(),

    // Model info
    modelInfo: null,
    columnInfo: null,

    /**
     * Component entry point (called by Alpine): applies deep-link URL
     * parameters, restores dark mode, wires keyboard/watchers and loads the
     * dataset plus the requested (or first) sample.
     */
    async init() {
      this.api = new DatasetAPI();

      // Read URL parameters for deep linking
      const urlParams = new URLSearchParams(window.location.search);
      const urlDataset = urlParams.get('dataset');
      const urlIndex = urlParams.get('index');
      const urlView = urlParams.get('view');
      const urlDiff = urlParams.get('diff');
      const urlMarkdown = urlParams.get('markdown');
      const urlReasoning = urlParams.get('reasoning');

      if (urlDataset) {
        this.datasetId = urlDataset;
      }
      if (urlView && ['comparison', 'diff', 'improved'].includes(urlView)) {
        this.activeTab = urlView;
      }
      if (urlDiff && ['char', 'word', 'line', 'markdown'].includes(urlDiff)) {
        this.diffMode = urlDiff;
      }
      if (urlMarkdown !== null) {
        this.renderMarkdown = urlMarkdown === 'true';
      }
      if (urlReasoning !== null) {
        this.showReasoning = urlReasoning === 'true';
      }

      // Apply dark mode from localStorage and persist later changes
      this.darkMode = localStorage.getItem('darkMode') === 'true';
      this.$watch('darkMode', (value) => {
        localStorage.setItem('darkMode', value);
        document.documentElement.classList.toggle('dark', value);
      });
      document.documentElement.classList.toggle('dark', this.darkMode);

      this.setupKeyboardNavigation();
      this.initWatchers();

      // When the URL names an index, loadDataset must not load sample 0 first.
      const hasUrlIndex = urlIndex !== null;
      await this.loadDataset(hasUrlIndex);

      // Keep the dataset error visible instead of replacing it with a
      // follow-up "failed to load sample" error.
      if (!hasUrlIndex || this.error) return;

      const requested = Number.parseInt(urlIndex, 10);
      const isValid =
        Number.isInteger(requested) &&
        requested >= 0 &&
        requested < this.totalSamples;
      // Invalid or out-of-range index falls back to the first sample.
      await this.loadSample(isValid ? requested : 0);
    },

    /**
     * Whether a keydown target is an element the user types into, in which
     * case global shortcuts must not fire.
     * @param {EventTarget|null} target
     * @returns {boolean}
     */
    isEditableTarget(target) {
      if (!target || !target.tagName) return false;
      return (
        ['INPUT', 'TEXTAREA', 'SELECT'].includes(target.tagName) ||
        target.isContentEditable === true
      );
    },

    /**
     * Registers the global keyboard shortcuts:
     * arrows / j / k navigate samples (Shift+arrow scrolls the dock when it
     * is open), 1-3 switch tabs, v toggles the dock.
     */
    setupKeyboardNavigation() {
      document.addEventListener('keydown', (e) => {
        // Leave browser/OS shortcuts alone (Ctrl+V, Alt+Left, Cmd+K, ...).
        if (e.ctrlKey || e.metaKey || e.altKey) return;
        // Ignore if user is typing
        if (this.isEditableTarget(e.target)) return;

        switch (e.key) {
          case 'ArrowLeft':
            e.preventDefault();
            if (e.shiftKey && this.showDock) {
              this.scrollDockLeft();
            } else {
              this.previousSample();
            }
            break;
          case 'ArrowRight':
            e.preventDefault();
            if (e.shiftKey && this.showDock) {
              this.scrollDockRight();
            } else {
              this.nextSample();
            }
            break;
          case 'k':
          case 'K':
            e.preventDefault();
            this.previousSample();
            break;
          case 'j':
          case 'J':
            e.preventDefault();
            this.nextSample();
            break;
          case '1':
            this.activeTab = 'comparison';
            break;
          case '2':
            this.activeTab = 'diff';
            break;
          case '3':
            this.activeTab = 'improved';
            break;
          case 'v':
          case 'V':
            // Toggle dock with V key
            if (this.showDock) {
              this.hideDockPreview();
            } else {
              this.showDockPreview();
            }
            break;
          default:
            break;
        }
      });
    },

    /**
     * Validates the current dataset, resolves its default config/split and
     * row count, and (unless skipped) loads the first sample. Failures are
     * reported through `this.error`.
     * @param {boolean} [skipInitialLoad=false] - do not load sample 0.
     */
    async loadDataset(skipInitialLoad = false) {
      this.loading = true;
      this.error = null;

      // Cached renders belong to the previous dataset.
      this.markdownCache.clear();

      try {
        await this.api.validateDataset(this.datasetId);

        const info = await this.api.getDatasetInfo(this.datasetId);
        this.datasetConfig = info.defaultConfig;
        this.datasetSplit = info.defaultSplit;

        this.totalSamples = await this.api.getTotalRows(
          this.datasetId,
          this.datasetConfig,
          this.datasetSplit
        );

        if (!skipInitialLoad) {
          this.currentIndex = 0;
          await this.loadSample(0);
        }
      } catch (error) {
        this.error = error.message;
      } finally {
        this.loading = false;
      }
    },

    /**
     * Loads one row and refreshes everything derived from it (model info,
     * reasoning trace, markdown detection, diff, URL).
     * @param {number} index - 0-based row index.
     */
    async loadSample(index) {
      try {
        const data = await this.api.getRow(
          this.datasetId,
          this.datasetConfig,
          this.datasetSplit,
          index
        );

        this.currentSample = data.row;
        this.currentIndex = index;
        this.columnInfo = data.columns;
        // A successful load supersedes an earlier sample-level error.
        this.error = null;

        this.extractModelInfo();

        // Debug: Log column info
        console.log('Column info:', this.columnInfo);
        console.log('Current sample keys:', Object.keys(this.currentSample));

        // Parse the RAW text of the new sample. getImprovedText() would still
        // reflect the previous sample's reasoning state at this point and
        // hand back the old answer instead of the new text.
        const rawImproved = this.getRawImprovedText();
        this.parseReasoningTrace(rawImproved);
        this.hasMarkdown = this.detectMarkdown(this.answerContent || rawImproved);

        this.updateDiff();

        // Update URL without triggering navigation
        this.updateURL();
      } catch (error) {
        this.error = `Failed to load sample: ${error.message}`;
      }
    },

    /** Advances to the next sample, if there is one. */
    async nextSample() {
      if (this.currentIndex < this.totalSamples - 1) {
        await this.loadSample(this.currentIndex + 1);
      }
    },

    /** Goes back to the previous sample, if there is one. */
    async previousSample() {
      if (this.currentIndex > 0) {
        await this.loadSample(this.currentIndex - 1);
      }
    },

    /**
     * Jumps to the 1-based page typed into `jumpToPage`; invalid input is
     * ignored. The input is cleared either way.
     */
    async jumpToSample() {
      const pageNum = Number.parseInt(this.jumpToPage, 10);
      const isValid =
        Number.isInteger(pageNum) && pageNum >= 1 && pageNum <= this.totalSamples;

      if (isValid) {
        // Convert 1-based page number to 0-based index
        await this.loadSample(pageNum - 1);
      }
      this.jumpToPage = '';
    },

    /**
     * Switches to another dataset and loads its first sample.
     * @param {string} datasetId
     */
    async selectDataset(datasetId) {
      this.datasetId = datasetId;
      await this.loadDataset();
    },

    /**
     * Populates `modelInfo` from the current sample's inference-info column,
     * or leaves it null when the column or its data is missing.
     */
    extractModelInfo() {
      this.modelInfo = null;

      if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
        console.log('No inference info column detected');
        return;
      }

      const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
      if (!inferenceData) {
        console.log('No inference data in current sample');
        return;
      }

      console.log('Raw inference data:', inferenceData);

      const parsed = this.api.parseInferenceInfo(inferenceData);
      console.log('Parsed inference data:', parsed);

      if (parsed) {
        const formattedInfo = this.formatModelInfo(parsed);
        // Ensure it's a plain object, not a proxy
        this.modelInfo = formattedInfo ? { ...formattedInfo } : null;
        console.log('Formatted model info:', this.modelInfo);
      }
    },

    /**
     * Maps parsed inference info (snake_case keys) to the camelCase shape
     * used by the UI.
     * @param {object|null} info
     * @returns {object|null}
     */
    formatModelInfo(info) {
      if (!info) return null;

      return {
        modelId: info.model_id || 'Unknown',
        modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
        processingDate: info.processing_date
          ? new Date(info.processing_date).toLocaleDateString()
          : null,
        scriptVersion: info.script_version || null,
        batchSize: info.batch_size || null,
        maxTokens: info.max_tokens || null,
        scriptUrl: info.script_url || null,
        columnName: info.column_name || null,
      };
    },

    /**
     * Resets the reasoning state and, when `ReasoningParser` is available,
     * splits `text` into reasoning and answer. Without a trace the whole text
     * becomes `answerContent`.
     * @param {string} text - raw improved text of the current sample.
     */
    parseReasoningTrace(text) {
      this.hasReasoningTrace = false;
      this.reasoningContent = null;
      this.answerContent = null;
      this.reasoningStats = null;
      this.formattedReasoning = null;

      if (!text || !window.ReasoningParser) return;

      if (!ReasoningParser.detectReasoningTrace(text)) {
        // No reasoning markers, use original text
        this.answerContent = text;
        return;
      }

      const parsed = ReasoningParser.parseReasoningContent(text);
      if (!parsed.hasReasoning) {
        // No reasoning found, use original text as answer
        this.answerContent = text;
        return;
      }

      this.hasReasoningTrace = true;
      this.reasoningContent = parsed.reasoning;
      this.answerContent = parsed.answer;
      this.formattedReasoning = ReasoningParser.formatReasoningSteps(parsed.reasoning);
      this.reasoningStats = ReasoningParser.getReasoningStats(parsed);

      console.log('Reasoning trace detected:', this.reasoningStats);
    },

    /** @returns {string} original OCR text of the current sample. */
    getOriginalText() {
      if (!this.currentSample) return '';
      const columns = this.api.detectColumns(null, this.currentSample);
      return this.currentSample[columns.originalText] || 'No original text found';
    },

    /**
     * @returns {string} improved text to display: the parsed answer when the
     * sample has a reasoning trace, otherwise the raw column value.
     */
    getImprovedText() {
      if (!this.currentSample) return '';
      if (this.hasReasoningTrace && this.answerContent) {
        return this.answerContent;
      }
      return this.getRawImprovedText();
    },

    /** @returns {string} improved text exactly as stored (trace included). */
    getRawImprovedText() {
      if (!this.currentSample) return '';
      const columns = this.api.detectColumns(null, this.currentSample);
      return this.currentSample[columns.improvedText] || 'No improved text found';
    },

    /**
     * Heuristically checks whether `text` contains markdown.
     * @param {string} text
     * @returns {boolean}
     */
    detectMarkdown(text) {
      if (!text) return false;

      const markdownPatterns = [
        /^#{1,6}\s/m, // Headers
        /\*\*[^*]+\*\*/, // Bold
        /\*[^*]+\*/, // Italic
        /\[[^\]]+\]\([^)]+\)/, // Links
        /^[-*+]\s/m, // Lists
        /^\d+\.\s/m, // Numbered lists
        /^>/m, // Blockquotes
        /```[\s\S]*?```/, // Code blocks
        /`[^`]+`/, // Inline code
        /\|.*\|.*\|/m, // Tables (basic detection)
        // NOTE(review): the two HTML patterns below were stripped from the
        // extracted source and are reconstructed -- confirm.
        /<table[^>]*>/i, // HTML tables
        /<th[^>]*>/i, // HTML table headers
      ];

      return markdownPatterns.some((pattern) => pattern.test(text));
    },

    /**
     * Escapes text for use as HTML content via a detached element.
     * @param {string} text
     * @returns {string}
     */
    escapeHtml(text) {
      const div = document.createElement('div');
      div.textContent = text;
      return div.innerHTML;
    },

    /**
     * Escapes a value for use inside a double-quoted HTML attribute.
     * `&` is left untouched because marked may hand over values that are
     * already entity-encoded.
     * @param {string} value
     * @returns {string}
     */
    escapeAttribute(value) {
      return String(value ?? '')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
    },

    /**
     * Returns `href` (attribute-escaped) when it resolves to an allowed
     * protocol, otherwise an empty string.
     *
     * The check uses the URL parser on the entity-decoded value rather than
     * removing "javascript:" substrings: substring removal is bypassable
     * ("javajavascript:script:" collapses back to "javascript:"), as are
     * entity-encoded or whitespace-split schemes.
     * @param {string|null} href
     * @returns {string}
     */
    sanitizeUrl(href) {
      const raw = String(href ?? '').trim();
      if (raw === '') return '';

      // A textarea decodes character references without running any markup.
      const decoder = document.createElement('textarea');
      decoder.innerHTML = raw;

      let protocol;
      try {
        protocol = new URL(decoder.value, window.location.href).protocol;
      } catch {
        return '';
      }

      // Relative URLs resolve to the page's own protocol, which is fine.
      const isAllowed =
        SAFE_URL_PROTOCOLS.includes(protocol) || protocol === window.location.protocol;
      return isAllowed ? this.escapeAttribute(raw) : '';
    },

    /**
     * Sanitizes a raw HTML fragment found in markdown: fragments made only of
     * table tags are kept with all attributes removed (so `onclick=` etc.
     * cannot get through); anything else is dropped.
     * Note: legitimate attributes such as colspan/rowspan are removed too.
     * @param {string} html
     * @returns {string}
     */
    sanitizeTableHtml(html) {
      const source = String(html ?? '');
      const tagPattern = new RegExp(
        `<(/?)(${ALLOWED_HTML_TAGS.join('|')})(\\s[^>]*)?>`,
        'gi'
      );

      // Anything that still looks like markup after removing allowed tags
      // (including unterminated tags) disqualifies the fragment.
      const remainder = source.replace(tagPattern, '');
      if (/<[a-z/!?]/i.test(remainder)) {
        return '';
      }

      return source.replace(
        tagPattern,
        (_match, slash, name) => `<${slash}${name.toLowerCase()}>`
      );
    },

    /**
     * Renders markdown to HTML when markdown rendering is enabled; otherwise
     * (or on failure) returns `text` unchanged. Results are cached per text.
     * @param {string} text
     * @returns {string}
     */
    renderMarkdownText(text) {
      if (!text || !this.renderMarkdown) return text;

      // Key on the full text: a key built from a prefix makes the original
      // and improved text of one sample collide when they start identically.
      const cacheKey = text;
      if (this.markdownCache.has(cacheKey)) {
        return this.markdownCache.get(cacheKey);
      }

      try {
        const renderer = new marked.Renderer();

        // NOTE(review): the markup in the link/image templates and the
        // wrapper below was stripped from the extracted source and is
        // reconstructed -- confirm tags and class names.

        // Links open in a new tab and only keep safe URLs.
        renderer.link = (href, title, linkText) => {
          const safeHref = this.sanitizeUrl(href);
          const safeTitle = this.escapeAttribute(title || '');
          const safeText = this.escapeHtml(linkText);
          return `<a href="${safeHref}" title="${safeTitle}" target="_blank" rel="noopener noreferrer" class="text-blue-600 hover:underline">${safeText}</a>`;
        };

        // Images only keep safe URLs.
        renderer.image = (href, title, altText) => {
          const safeHref = this.sanitizeUrl(href);
          const safeTitle = this.escapeAttribute(title || '');
          const safeAlt = this.escapeAttribute(this.escapeHtml(altText));
          return `<img src="${safeHref}" alt="${safeAlt}" title="${safeTitle}" class="max-w-full h-auto">`;
        };

        // Raw HTML is dropped, except attribute-free table markup.
        renderer.html = (html) => this.sanitizeTableHtml(html);

        marked.setOptions({
          renderer,
          breaks: true, // Convert \n to <br>
          gfm: true, // GitHub Flavored Markdown
          pedantic: false,
          smartLists: true,
          smartypants: true,
          headerIds: false, // Disable header IDs for security
          mangle: false, // Don't mangle email addresses
          sanitize: false, // We handle sanitization ourselves
          tables: true, // Enable table parsing
        });

        let html = marked.parse(text);

        // Add Tailwind classes to bare (attribute-less) tags.
        const styledTags = Object.keys(MARKDOWN_ELEMENT_CLASSES).join('|');
        html = html.replace(
          new RegExp(`<(${styledTags})>`, 'g'),
          (_match, tag) => `<${tag} class="${MARKDOWN_ELEMENT_CLASSES[tag]}">`
        );

        const result = `<div class="markdown-content">${html}</div>`;

        // Bound the cache; Map iteration order makes the first key the oldest.
        if (this.markdownCache.size >= MARKDOWN_CACHE_LIMIT) {
          const oldestKey = this.markdownCache.keys().next().value;
          this.markdownCache.delete(oldestKey);
        }
        this.markdownCache.set(cacheKey, result);

        return result;
      } catch (error) {
        console.error('Markdown rendering error:', error);
        return text;
      }
    },

    /** @returns {string} improved text, markdown-rendered when enabled. */
    getImprovedTextRendered() {
      return this.renderMarkdownText(this.getImprovedText());
    },

    /** @returns {object|null} image column value of the current sample. */
    getImageData() {
      if (!this.currentSample) return null;
      const columns = this.api.detectColumns(null, this.currentSample);
      return columns.image ? this.currentSample[columns.image] : null;
    },

    /** @returns {string} image URL of the current sample, or ''. */
    getImageSrc() {
      return this.getImageData()?.src || '';
    },

    /** @returns {string|null} "W×H" when both dimensions are known. */
    getImageDimensions() {
      const imageData = this.getImageData();
      if (imageData?.width && imageData?.height) {
        return `${imageData.width}×${imageData.height}`;
      }
      return null;
    },

    /** Recomputes statistics and `diffHtml` for the current diff mode. */
    updateDiff() {
      const original = this.getOriginalText();
      const improved = this.getImprovedText();

      this.calculateStatistics(original, improved);

      switch (this.diffMode) {
        case 'char':
          this.diffHtml = createCharacterDiff(original, improved);
          break;
        case 'word':
          this.diffHtml = createWordDiff(original, improved);
          break;
        case 'line':
          this.diffHtml = createLineDiff(original, improved);
          break;
        case 'markdown':
          // Pass the render function bound to this context
          this.diffHtml = createMarkdownDiff(original, improved, (text) =>
            this.renderMarkdownText(text)
          );
          break;
        default:
          break;
      }
    },

    /**
     * Updates `similarity`, `charStats` and `wordStats`.
     * @param {string} original
     * @param {string} improved
     */
    calculateStatistics(original, improved) {
      this.similarity = calculateSimilarity(original, improved);
      this.charStats = this.getCharacterDiffStats(original, improved);

      const countWords = (text) =>
        text.split(/\s+/).filter((word) => word.length > 0).length;

      this.wordStats = {
        original: countWords(original),
        improved: countWords(improved),
      };
    },

    /**
     * Counts inserted / deleted / unchanged characters between two texts.
     * @param {string} original
     * @param {string} improved
     * @returns {{total: number, added: number, removed: number, unchanged: number}}
     */
    getCharacterDiffStats(original, improved) {
      const dp = computeLCS(original, improved);
      const diff = buildDiff(original, improved, dp);

      let added = 0;
      let removed = 0;
      let unchanged = 0;

      for (const part of diff) {
        if (part.type === 'insert') {
          added += part.value.length;
        } else if (part.type === 'delete') {
          removed += part.value.length;
        } else {
          unchanged += part.value.length;
        }
      }

      return { total: original.length, added, removed, unchanged };
    },

    /**
     * Whether the URL that just failed on `img` is one this component set
     * itself while recovering. Used to stop error -> refresh -> error loops.
     * @param {HTMLImageElement} img
     * @returns {boolean}
     */
    isRefreshedImage(img) {
      return img.dataset.refreshedSrc === (img.getAttribute('src') ?? '');
    },

    /**
     * Requests a fresh image URL for row `index` and applies it to `img`.
     * @param {HTMLImageElement} img
     * @param {number} index
     * @returns {Promise<string|null>} the new URL, or null if none was found.
     */
    async refreshImageSource(img, index) {
      const data = await this.api.refreshImageUrl(
        this.datasetId,
        this.datasetConfig,
        this.datasetSplit,
        index
      );

      const columns = data.row ? this.api.detectColumns(null, data.row) : null;
      const src = columns?.image ? data.row[columns.image]?.src : null;
      if (!src) return null;

      // Remember what we set so a failure of this very URL is not retried.
      img.dataset.refreshedSrc = src;
      img.src = src;
      return src;
    },

    /**
     * `error` handler for the main image: tries once to refresh the URL.
     * @param {Event} event
     */
    async handleImageError(event) {
      const img = event.target;
      if (this.isRefreshedImage(img)) return;

      console.log('Image failed to load, refreshing URL...');

      try {
        await this.refreshImageSource(img, this.currentIndex);
      } catch (error) {
        console.error('Failed to refresh image URL:', error);
        // Clear the source. Assigning '' can fire another error event, so
        // record it as "set by us" to keep the handler from looping.
        img.dataset.refreshedSrc = '';
        img.src = '';
      }
    },

    /** Downloads a plain-text report comparing original and improved text. */
    exportComparison() {
      const original = this.getOriginalText();
      const improved = this.getImprovedText();
      const page = this.currentIndex + 1;
      const rule = '='.repeat(50);

      const parts = [
        `OCR Text Comparison Export\n`,
        `==========================\n\n`,
        `Dataset: ${this.datasetId}\n`,
        `Page: ${page} of ${this.totalSamples}\n`,
        `Export Date: ${new Date().toLocaleString()}\n`,
        `Similarity: ${this.similarity}%\n`,
        `Characters: ${this.charStats.total} total, `,
        `${this.charStats.added} added, `,
        `${this.charStats.removed} removed\n`,
        `Words: ${this.wordStats.original} → ${this.wordStats.improved}\n`,
        `\n${rule}\n\n`,
        `ORIGINAL OCR:\n`,
        `${rule}\n`,
        original,
        `\n\n${rule}\n\n`,
      ];

      // Include reasoning trace if available
      if (this.hasReasoningTrace && this.reasoningContent) {
        parts.push(
          `MODEL REASONING:\n`,
          `${rule}\n`,
          this.reasoningContent,
          `\n\n${rule}\n\n`
        );
      }

      parts.push(`IMPROVED OCR:\n`, `${rule}\n`, improved);

      const blob = new Blob([parts.join('')], { type: 'text/plain' });
      const url = URL.createObjectURL(blob);
      const link = document.createElement('a');
      link.href = url;
      link.download = `ocr-comparison-${this.datasetId.replace(/\//g, '-')}-page-${page}.txt`;
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
      URL.revokeObjectURL(url);
    },

    /**
     * Fetches thumbnail entries for the given row indices in parallel.
     * Rows that fail to load are logged and left out; order is preserved.
     * @param {number[]} indices
     * @param {string} label - 'flow' or 'dock', used in the error message.
     * @returns {Promise<Array<{index: number, imageSrc: string, row: object}>>}
     */
    async loadThumbnails(indices, label) {
      const results = await Promise.all(
        indices.map(async (idx) => {
          try {
            const data = await this.api.getRow(
              this.datasetId,
              this.datasetConfig,
              this.datasetSplit,
              idx
            );
            const columns = this.api.detectColumns(null, data.row);
            const imageData = columns.image ? data.row[columns.image] : null;
            return { index: idx, imageSrc: imageData?.src || '', row: data.row };
          } catch (error) {
            console.error(`Failed to load ${label} item ${idx}:`, error);
            return null;
          }
        })
      );
      return results.filter((item) => item !== null);
    },

    // Flow view methods

    /** Opens/closes the flow view; opening centers it on the current page. */
    async toggleFlowView() {
      this.showFlowView = !this.showFlowView;
      if (this.showFlowView) {
        this.flowStartIndex = Math.max(
          0,
          this.currentIndex - Math.floor(this.flowVisibleCount / 2)
        );
        await this.loadFlowItems();
      }
    },

    /** Loads the thumbnails visible in the flow view from `flowStartIndex`. */
    async loadFlowItems() {
      const startIdx = this.flowStartIndex;
      const total = this.totalSamples ?? 0;
      const count = Math.max(0, Math.min(this.flowVisibleCount, total - startIdx));
      const indices = Array.from({ length: count }, (_, i) => startIdx + i);

      this.flowItems = [];
      const items = await this.loadThumbnails(indices, 'flow');

      // The view scrolled while we were loading: a newer call owns the list.
      if (this.flowStartIndex !== startIdx) return;
      this.flowItems = items;
    },

    /** Scrolls the flow view one page to the left. */
    async scrollFlowLeft() {
      if (this.flowStartIndex > 0) {
        this.flowStartIndex = Math.max(0, this.flowStartIndex - this.flowVisibleCount);
        await this.loadFlowItems();
      }
    },

    /** Scrolls the flow view one page to the right. */
    async scrollFlowRight() {
      const lastStart = this.totalSamples - this.flowVisibleCount;
      if (this.flowStartIndex < lastStart) {
        this.flowStartIndex = Math.min(
          lastStart,
          this.flowStartIndex + this.flowVisibleCount
        );
        await this.loadFlowItems();
      }
    },

    /**
     * Closes the flow view and loads the chosen sample.
     * @param {number} index
     */
    async jumpToFlowPage(index) {
      this.showFlowView = false;
      await this.loadSample(index);
    },

    /**
     * `error` handler for a flow thumbnail: tries once to refresh its URL
     * and stores the new URL on the matching flow item.
     * @param {Event} event
     * @param {number} index - row index of the thumbnail.
     */
    async handleFlowImageError(event, index) {
      const img = event.target;
      if (this.isRefreshedImage(img)) return;

      try {
        const src = await this.refreshImageSource(img, index);
        if (!src) return;

        const flowItem = this.flowItems.find((item) => item.index === index);
        if (flowItem) {
          flowItem.imageSrc = src;
        }
      } catch (error) {
        console.error('Failed to refresh flow image URL:', error);
      }
    },

    // Dock methods

    /** Shows the dock centered on the current page and reloads its items. */
    async showDockPreview() {
      // Cancel a pending hide so the dock does not vanish right after opening.
      if (this.dockHideTimeout) {
        clearTimeout(this.dockHideTimeout);
        this.dockHideTimeout = null;
      }

      this.showDock = true;

      this.dockStartIndex = Math.max(
        0,
        Math.min(
          this.currentIndex - Math.floor(this.dockVisibleCount / 2),
          this.totalSamples - this.dockVisibleCount
        )
      );

      // Always reload dock items to show current position
      await this.loadDockItems();
    },

    /** Hides the dock after a short delay (prevents flickering). */
    hideDockPreview() {
      // Replace, rather than stack, an already pending hide timer.
      if (this.dockHideTimeout) {
        clearTimeout(this.dockHideTimeout);
      }
      this.dockHideTimeout = setTimeout(() => {
        this.showDock = false;
        this.dockHideTimeout = null;
      }, 300);
    },

    /** Loads the thumbnails visible in the dock from `dockStartIndex`. */
    async loadDockItems() {
      const startIdx = this.dockStartIndex;
      const endIdx = Math.min(this.totalSamples ?? 0, startIdx + this.dockVisibleCount);
      const indices = Array.from(
        { length: Math.max(0, endIdx - startIdx) },
        (_, i) => startIdx + i
      );

      this.dockItems = [];
      const items = await this.loadThumbnails(indices, 'dock');

      // The dock scrolled while we were loading: a newer call owns the list.
      if (this.dockStartIndex !== startIdx) return;
      this.dockItems = items;
    },

    /** Scrolls the dock half a page to the left. */
    async scrollDockLeft() {
      if (this.dockStartIndex > 0) {
        this.dockStartIndex = Math.max(
          0,
          this.dockStartIndex - Math.floor(this.dockVisibleCount / 2)
        );
        await this.loadDockItems();
      }
    },

    /** Scrolls the dock half a page to the right. */
    async scrollDockRight() {
      const lastStart = this.totalSamples - this.dockVisibleCount;
      if (this.dockStartIndex < lastStart) {
        this.dockStartIndex = Math.min(
          lastStart,
          this.dockStartIndex + Math.floor(this.dockVisibleCount / 2)
        );
        await this.loadDockItems();
      }
    },

    /**
     * Closes the dock and loads the chosen sample.
     * @param {number} index
     */
    async jumpToDockPage(index) {
      this.showDock = false;
      await this.loadSample(index);
    },

    /**
     * Builds the deep-link URL describing the current view.
     * @returns {URL}
     */
    buildShareUrl() {
      const url = new URL(window.location.href);
      url.searchParams.set('dataset', this.datasetId);
      url.searchParams.set('index', this.currentIndex);
      url.searchParams.set('view', this.activeTab);
      url.searchParams.set('diff', this.diffMode);
      url.searchParams.set('markdown', this.renderMarkdown);

      // The reasoning flag only makes sense for samples that have a trace;
      // drop a value left over from a previous sample.
      if (this.hasReasoningTrace) {
        url.searchParams.set('reasoning', this.showReasoning);
      } else {
        url.searchParams.delete('reasoning');
      }
      return url;
    },

    /** Reflects the current state in the address bar (no navigation). */
    updateURL() {
      window.history.replaceState({}, '', this.buildShareUrl());
    },

    /** Shows the "link copied" feedback for two seconds. */
    flashShareSuccess() {
      this.showShareSuccess = true;
      setTimeout(() => {
        this.showShareSuccess = false;
      }, 2000);
    },

    /**
     * Copies a shareable link to the clipboard.
     * @returns {Promise<boolean>} true when the link was copied.
     */
    async copyShareLink() {
      const shareUrl = this.buildShareUrl().toString();

      try {
        await navigator.clipboard.writeText(shareUrl);
        this.flashShareSuccess();
        return true;
      } catch (clipboardError) {
        // Fallback for older browsers / insecure contexts
        const textArea = document.createElement('textarea');
        textArea.value = shareUrl;
        textArea.style.position = 'fixed';
        textArea.style.opacity = '0';
        document.body.appendChild(textArea);
        textArea.select();

        try {
          document.execCommand('copy');
          this.flashShareSuccess();
          return true;
        } catch (fallbackError) {
          console.error('Failed to copy link:', fallbackError);
          return false;
        } finally {
          document.body.removeChild(textArea);
        }
      }
    },

    /** Registers watchers that keep the diff and the URL up to date. */
    initWatchers() {
      this.$watch('diffMode', () => {
        this.updateDiff();
        this.updateURL();
      });
      this.$watch('currentSample', () => this.updateDiff());
      this.$watch('activeTab', () => this.updateURL());
      this.$watch('renderMarkdown', () => this.updateURL());
      this.$watch('showReasoning', () => this.updateURL());
    },
  }));
});