/** * Main Alpine.js application for OCR Text Explorer */ document.addEventListener('alpine:init', () => { Alpine.data('ocrExplorer', () => ({ // Dataset state datasetId: 'davanstrien/exams-ocr', datasetConfig: 'default', datasetSplit: 'train', // Example datasets exampleDatasets: [ { id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' }, { id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }, { id: 'davanstrien/india-medical-ocr-test', name: 'India Medical OCR', description: 'Medical documents with NuMarkdown reasoning traces' } ], // Navigation state currentIndex: 0, totalSamples: null, currentSample: null, jumpToPage: '', // UI state loading: false, error: null, activeTab: 'comparison', diffMode: 'char', darkMode: false, showAbout: false, showFlowView: false, showDock: false, renderMarkdown: false, hasMarkdown: false, showShareSuccess: false, // Reasoning trace state hasReasoningTrace: false, showReasoning: false, reasoningContent: null, answerContent: null, reasoningStats: null, formattedReasoning: null, // Flow view state flowItems: [], flowStartIndex: 0, flowVisibleCount: 7, flowOffset: 0, // Dock state dockItems: [], dockHideTimeout: null, dockStartIndex: 0, dockVisibleCount: 10, // Computed diff HTML diffHtml: '', // Statistics similarity: 0, charStats: { total: 0, added: 0, removed: 0 }, wordStats: { original: 0, improved: 0 }, // API instance api: null, // Markdown cache markdownCache: new Map(), // Model info modelInfo: null, columnInfo: null, async init() { // Initialize API this.api = new DatasetAPI(); // Read URL parameters for deep linking const urlParams = new URLSearchParams(window.location.search); const urlDataset = urlParams.get('dataset'); const urlIndex = urlParams.get('index'); const urlView = urlParams.get('view'); const urlDiff = urlParams.get('diff'); const urlMarkdown = urlParams.get('markdown'); const urlReasoning = urlParams.get('reasoning'); // Apply URL parameters if present if (urlDataset) { this.datasetId = urlDataset; } if (urlView && ['comparison', 'diff', 'improved'].includes(urlView)) { this.activeTab = urlView; } if (urlDiff && ['char', 'word', 'line', 'markdown'].includes(urlDiff)) { this.diffMode = urlDiff; } if (urlMarkdown !== null) { this.renderMarkdown = urlMarkdown === 'true'; } if (urlReasoning !== null) { this.showReasoning = urlReasoning === 'true'; } // Apply dark mode from localStorage this.darkMode = localStorage.getItem('darkMode') === 'true'; this.$watch('darkMode', value => { localStorage.setItem('darkMode', value); document.documentElement.classList.toggle('dark', value); }); document.documentElement.classList.toggle('dark', this.darkMode); // Setup keyboard navigation this.setupKeyboardNavigation(); // Setup watchers for URL updates this.initWatchers(); // Check if we have a specific index to load const hasUrlIndex = urlIndex !== null; // Load initial dataset (skip initial load if we have a URL index) await this.loadDataset(hasUrlIndex); // Jump to specific index if provided in URL if (hasUrlIndex) { const index = parseInt(urlIndex); if (!isNaN(index) && index >= 0 && index < this.totalSamples) { await this.loadSample(index); } else { // If invalid index, load the first sample await this.loadSample(0); } } }, setupKeyboardNavigation() { document.addEventListener('keydown', (e) => { // Ignore if user is typing in input if (e.target.tagName === 'INPUT') return; switch(e.key) { case 'ArrowLeft': e.preventDefault(); if (e.shiftKey && this.showDock) { this.scrollDockLeft(); } else { this.previousSample(); } break; case 'ArrowRight': e.preventDefault(); if (e.shiftKey && this.showDock) { this.scrollDockRight(); } else { this.nextSample(); } break; case 'k': case 'K': e.preventDefault(); this.previousSample(); break; case 'j': case 'J': e.preventDefault(); this.nextSample(); break; case '1': this.activeTab = 'comparison'; break; case '2': this.activeTab = 'diff'; break; case '3': this.activeTab = 'improved'; break; case 'v': case 'V': // Toggle dock with V key if (this.showDock) { this.hideDockPreview(); } else { this.showDockPreview(); } break; } }); }, async loadDataset(skipInitialLoad = false) { this.loading = true; this.error = null; // Clear markdown cache when loading new dataset this.markdownCache.clear(); try { // Validate dataset await this.api.validateDataset(this.datasetId); // Get dataset info const info = await this.api.getDatasetInfo(this.datasetId); this.datasetConfig = info.defaultConfig; this.datasetSplit = info.defaultSplit; // Get total rows this.totalSamples = await this.api.getTotalRows( this.datasetId, this.datasetConfig, this.datasetSplit ); // Load first sample only if not skipping if (!skipInitialLoad) { this.currentIndex = 0; await this.loadSample(0); } } catch (error) { this.error = error.message; } finally { this.loading = false; } }, async loadSample(index) { try { const data = await this.api.getRow( this.datasetId, this.datasetConfig, this.datasetSplit, index ); this.currentSample = data.row; this.currentIndex = index; this.columnInfo = data.columns; // Extract model info if available this.extractModelInfo(); // Debug: Log column info console.log('Column info:', this.columnInfo); console.log('Current sample keys:', Object.keys(this.currentSample)); // Check if improved text contains markdown and reasoning traces const improvedText = this.getImprovedText(); this.parseReasoningTrace(improvedText); this.hasMarkdown = this.detectMarkdown(this.answerContent || improvedText); // Update diff when sample changes this.updateDiff(); // Update URL without triggering navigation this.updateURL(); } catch (error) { this.error = `Failed to load sample: ${error.message}`; } }, async nextSample() { if (this.currentIndex < this.totalSamples - 1) { await this.loadSample(this.currentIndex + 1); } }, async previousSample() { if (this.currentIndex > 0) { await this.loadSample(this.currentIndex - 1); } }, async jumpToSample() { const pageNum = parseInt(this.jumpToPage); if (!isNaN(pageNum) && pageNum >= 1 && pageNum <= this.totalSamples) { // Convert 1-based page number to 0-based index await this.loadSample(pageNum - 1); // Clear the input after jumping this.jumpToPage = ''; } else { // Show error or just reset this.jumpToPage = ''; } }, async selectDataset(datasetId) { this.datasetId = datasetId; await this.loadDataset(); }, extractModelInfo() { this.modelInfo = null; if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) { console.log('No inference info column detected'); return; } const inferenceData = this.currentSample[this.columnInfo.inferenceInfo]; if (!inferenceData) { console.log('No inference data in current sample'); return; } console.log('Raw inference data:', inferenceData); const parsed = this.api.parseInferenceInfo(inferenceData); console.log('Parsed inference data:', parsed); if (parsed) { const formattedInfo = this.formatModelInfo(parsed); // Ensure it's a plain object, not a proxy this.modelInfo = formattedInfo ? {...formattedInfo} : null; console.log('Formatted model info:', this.modelInfo); } }, formatModelInfo(info) { if (!info) return null; return { modelId: info.model_id || 'Unknown', modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown', processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null, scriptVersion: info.script_version || null, batchSize: info.batch_size || null, maxTokens: info.max_tokens || null, scriptUrl: info.script_url || null, columnName: info.column_name || null }; }, parseReasoningTrace(text) { // Reset reasoning state this.hasReasoningTrace = false; this.reasoningContent = null; this.answerContent = null; this.reasoningStats = null; this.formattedReasoning = null; if (!text || !window.ReasoningParser) return; // Check if text contains reasoning trace if (ReasoningParser.detectReasoningTrace(text)) { const parsed = ReasoningParser.parseReasoningContent(text); if (parsed.hasReasoning) { this.hasReasoningTrace = true; this.reasoningContent = parsed.reasoning; this.answerContent = parsed.answer; this.formattedReasoning = ReasoningParser.formatReasoningSteps(parsed.reasoning); this.reasoningStats = ReasoningParser.getReasoningStats(parsed); console.log('Reasoning trace detected:', this.reasoningStats); } else { // No reasoning found, use original text as answer this.answerContent = text; } } else { // No reasoning markers, use original text this.answerContent = text; } }, getOriginalText() { if (!this.currentSample) return ''; const columns = this.api.detectColumns(null, this.currentSample); return this.currentSample[columns.originalText] || 'No original text found'; }, getImprovedText() { if (!this.currentSample) return ''; const columns = this.api.detectColumns(null, this.currentSample); const rawText = this.currentSample[columns.improvedText] || 'No improved text found'; // If we have parsed answer content from reasoning trace, use that // Otherwise return the raw text return this.hasReasoningTrace && this.answerContent ? this.answerContent : rawText; }, getRawImprovedText() { // Get the raw improved text without parsing reasoning traces if (!this.currentSample) return ''; const columns = this.api.detectColumns(null, this.currentSample); return this.currentSample[columns.improvedText] || 'No improved text found'; }, detectMarkdown(text) { // Check for common markdown patterns const markdownPatterns = [ /^#{1,6}\s/m, // Headers /\*\*[^*]+\*\*/, // Bold /\*[^*]+\*/, // Italic /\[[^\]]+\]\([^)]+\)/, // Links /^[-*+]\s/m, // Lists /^\d+\.\s/m, // Numbered lists /^>/m, // Blockquotes /```[\s\S]*?```/, // Code blocks /`[^`]+`/, // Inline code /\|.*\|.*\|/m, // Tables (basic detection) /
/g, ' | '); html = html.replace(/ | /g, ' | ');
const result = ` ${html} `;
// Cache the result (limit cache size to prevent memory issues)
if (this.markdownCache.size > 50) {
// Remove oldest entries
const firstKey = this.markdownCache.keys().next().value;
this.markdownCache.delete(firstKey);
}
this.markdownCache.set(cacheKey, result);
return result;
} catch (error) {
console.error('Markdown rendering error:', error);
return text;
}
},
getImprovedTextRendered() {
const text = this.getImprovedText();
return this.renderMarkdownText(text);
},
getImageData() {
if (!this.currentSample) return null;
const columns = this.api.detectColumns(null, this.currentSample);
return columns.image ? this.currentSample[columns.image] : null;
},
getImageSrc() {
const imageData = this.getImageData();
return imageData?.src || '';
},
getImageDimensions() {
const imageData = this.getImageData();
if (imageData?.width && imageData?.height) {
return `${imageData.width}×${imageData.height}`;
}
return null;
},
updateDiff() {
const original = this.getOriginalText();
const improved = this.getImprovedText();
// Calculate statistics
this.calculateStatistics(original, improved);
// Use diff utility based on mode
switch(this.diffMode) {
case 'char':
this.diffHtml = createCharacterDiff(original, improved);
break;
case 'word':
this.diffHtml = createWordDiff(original, improved);
break;
case 'line':
this.diffHtml = createLineDiff(original, improved);
break;
case 'markdown':
// Pass the render function bound to this context
this.diffHtml = createMarkdownDiff(original, improved, (text) => this.renderMarkdownText(text));
break;
}
},
calculateStatistics(original, improved) {
// Calculate similarity
this.similarity = calculateSimilarity(original, improved);
// Character statistics
const charDiff = this.getCharacterDiffStats(original, improved);
this.charStats = charDiff;
// Word statistics
const originalWords = original.split(/\s+/).filter(w => w.length > 0);
const improvedWords = improved.split(/\s+/).filter(w => w.length > 0);
this.wordStats = {
original: originalWords.length,
improved: improvedWords.length
};
},
getCharacterDiffStats(original, improved) {
const dp = computeLCS(original, improved);
const diff = buildDiff(original, improved, dp);
let added = 0;
let removed = 0;
let unchanged = 0;
for (const part of diff) {
if (part.type === 'insert') {
added += part.value.length;
} else if (part.type === 'delete') {
removed += part.value.length;
} else {
unchanged += part.value.length;
}
}
return {
total: original.length,
added: added,
removed: removed,
unchanged: unchanged
};
},
async handleImageError(event) {
// Try to refresh the image URL
console.log('Image failed to load, refreshing URL...');
try {
const data = await this.api.refreshImageUrl(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
this.currentIndex
);
// Update the image source
if (data.row && data.row[this.api.detectColumns(null, data.row).image]?.src) {
event.target.src = data.row[this.api.detectColumns(null, data.row).image].src;
}
} catch (error) {
console.error('Failed to refresh image URL:', error);
// Set a placeholder image
event.target.src = '';
}
},
exportComparison() {
const original = this.getOriginalText();
const improved = this.getImprovedText();
const metadata = {
dataset: this.datasetId,
page: this.currentIndex + 1,
totalPages: this.totalSamples,
exportDate: new Date().toISOString(),
similarity: `${this.similarity}%`,
statistics: {
characters: this.charStats,
words: this.wordStats
}
};
// Create export content
let content = `OCR Text Comparison Export\n`;
content += `==========================\n\n`;
content += `Dataset: ${metadata.dataset}\n`;
content += `Page: ${metadata.page} of ${metadata.totalPages}\n`;
content += `Export Date: ${new Date().toLocaleString()}\n`;
content += `Similarity: ${metadata.similarity}\n`;
content += `Characters: ${metadata.statistics.characters.total} total, `;
content += `${metadata.statistics.characters.added} added, `;
content += `${metadata.statistics.characters.removed} removed\n`;
content += `Words: ${metadata.statistics.words.original} → ${metadata.statistics.words.improved}\n`;
content += `\n${'='.repeat(50)}\n\n`;
content += `ORIGINAL OCR:\n`;
content += `${'='.repeat(50)}\n`;
content += original;
content += `\n\n${'='.repeat(50)}\n\n`;
// Include reasoning trace if available
if (this.hasReasoningTrace && this.reasoningContent) {
content += `MODEL REASONING:\n`;
content += `${'='.repeat(50)}\n`;
content += this.reasoningContent;
content += `\n\n${'='.repeat(50)}\n\n`;
}
content += `IMPROVED OCR:\n`;
content += `${'='.repeat(50)}\n`;
content += improved;
// Download file
const blob = new Blob([content], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `ocr-comparison-${this.datasetId.replace('/', '-')}-page-${this.currentIndex + 1}.txt`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
// Flow view methods
async toggleFlowView() {
this.showFlowView = !this.showFlowView;
if (this.showFlowView) {
// Reset to center around current page when opening
this.flowStartIndex = Math.max(0, this.currentIndex - Math.floor(this.flowVisibleCount / 2));
await this.loadFlowItems();
}
},
async loadFlowItems() {
// Load thumbnails from flowStartIndex
const startIdx = this.flowStartIndex;
this.flowItems = [];
// Load visible items
for (let i = 0; i < this.flowVisibleCount && (startIdx + i) < this.totalSamples; i++) {
const idx = startIdx + i;
try {
const data = await this.api.getRow(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
idx
);
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
this.flowItems.push({
index: idx,
imageSrc: imageData?.src || '',
row: data.row
});
} catch (error) {
console.error(`Failed to load flow item ${idx}:`, error);
}
}
},
scrollFlowLeft() {
if (this.flowStartIndex > 0) {
this.flowStartIndex = Math.max(0, this.flowStartIndex - this.flowVisibleCount);
this.loadFlowItems();
}
},
scrollFlowRight() {
if (this.flowStartIndex < this.totalSamples - this.flowVisibleCount) {
this.flowStartIndex = Math.min(
this.totalSamples - this.flowVisibleCount,
this.flowStartIndex + this.flowVisibleCount
);
this.loadFlowItems();
}
},
async jumpToFlowPage(index) {
this.showFlowView = false;
await this.loadSample(index);
},
async handleFlowImageError(event, index) {
// Try to refresh the image URL for flow item
try {
const data = await this.api.refreshImageUrl(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
index
);
if (data.row) {
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
if (imageData?.src) {
event.target.src = imageData.src;
// Update the flow item
const flowItem = this.flowItems.find(item => item.index === index);
if (flowItem) {
flowItem.imageSrc = imageData.src;
}
}
}
} catch (error) {
console.error('Failed to refresh flow image URL:', error);
}
},
// Dock methods
async showDockPreview() {
// Clear any hide timeout
if (this.dockHideTimeout) {
clearTimeout(this.dockHideTimeout);
this.dockHideTimeout = null;
}
this.showDock = true;
// Center dock around current page
this.dockStartIndex = Math.max(0,
Math.min(
this.currentIndex - Math.floor(this.dockVisibleCount / 2),
this.totalSamples - this.dockVisibleCount
)
);
// Always reload dock items to show current position
await this.loadDockItems();
},
hideDockPreview() {
// Add a small delay to prevent flickering
this.dockHideTimeout = setTimeout(() => {
this.showDock = false;
}, 300);
},
async loadDockItems() {
// Load thumbnails based on dock start index
const endIdx = Math.min(this.totalSamples, this.dockStartIndex + this.dockVisibleCount);
this.dockItems = [];
for (let i = this.dockStartIndex; i < endIdx; i++) {
try {
const data = await this.api.getRow(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
i
);
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
this.dockItems.push({
index: i,
imageSrc: imageData?.src || '',
row: data.row
});
} catch (error) {
console.error(`Failed to load dock item ${i}:`, error);
}
}
},
async scrollDockLeft() {
if (this.dockStartIndex > 0) {
this.dockStartIndex = Math.max(0, this.dockStartIndex - Math.floor(this.dockVisibleCount / 2));
await this.loadDockItems();
}
},
async scrollDockRight() {
if (this.dockStartIndex < this.totalSamples - this.dockVisibleCount) {
this.dockStartIndex = Math.min(
this.totalSamples - this.dockVisibleCount,
this.dockStartIndex + Math.floor(this.dockVisibleCount / 2)
);
await this.loadDockItems();
}
},
async jumpToDockPage(index) {
this.showDock = false;
await this.loadSample(index);
},
// Update URL with current state
updateURL() {
const url = new URL(window.location);
url.searchParams.set('dataset', this.datasetId);
url.searchParams.set('index', this.currentIndex);
url.searchParams.set('view', this.activeTab);
url.searchParams.set('diff', this.diffMode);
url.searchParams.set('markdown', this.renderMarkdown);
// Only add reasoning parameter if there's a reasoning trace
if (this.hasReasoningTrace) {
url.searchParams.set('reasoning', this.showReasoning);
}
window.history.replaceState({}, '', url);
},
// Copy shareable link to clipboard
async copyShareLink() {
const url = new URL(window.location);
url.searchParams.set('dataset', this.datasetId);
url.searchParams.set('index', this.currentIndex);
url.searchParams.set('view', this.activeTab);
url.searchParams.set('diff', this.diffMode);
url.searchParams.set('markdown', this.renderMarkdown);
// Only add reasoning parameter if there's a reasoning trace
if (this.hasReasoningTrace) {
url.searchParams.set('reasoning', this.showReasoning);
}
const shareUrl = url.toString();
try {
await navigator.clipboard.writeText(shareUrl);
// Show success feedback
this.showShareSuccess = true;
setTimeout(() => {
this.showShareSuccess = false;
}, 2000);
return true;
} catch (err) {
// Fallback for older browsers
const textArea = document.createElement('textarea');
textArea.value = shareUrl;
textArea.style.position = 'fixed';
textArea.style.opacity = '0';
document.body.appendChild(textArea);
textArea.select();
try {
document.execCommand('copy');
// Show success feedback
this.showShareSuccess = true;
setTimeout(() => {
this.showShareSuccess = false;
}, 2000);
return true;
} catch (err) {
console.error('Failed to copy link:', err);
return false;
} finally {
document.body.removeChild(textArea);
}
}
},
// Watch for diff mode changes
initWatchers() {
this.$watch('diffMode', () => {
this.updateDiff();
this.updateURL();
});
this.$watch('currentSample', () => this.updateDiff());
this.$watch('activeTab', () => this.updateURL());
this.$watch('renderMarkdown', () => this.updateURL());
this.$watch('showReasoning', () => this.updateURL());
}
}));
}); |
---|