Add 2 files
Browse files- index.html +133 -14
- prompts.txt +2 -1
index.html
CHANGED
@@ -7,6 +7,8 @@
|
|
7 |
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/pdf-lib.min.js"></script>
|
9 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
|
|
|
|
10 |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
|
11 |
<style>
|
12 |
.dropzone {
|
@@ -138,7 +140,7 @@
|
|
138 |
</div>
|
139 |
|
140 |
<!-- Upload source document -->
|
141 |
-
<div class="bg-white rounded-lg shadow p-6 mb-8">
|
142 |
<h3 class="text-lg font-medium text-gray-900 mb-4">Upload Source Document</h3>
|
143 |
<p class="text-sm text-gray-500 mb-6">Upload a completed invoice, form, or PDF document that contains the data you want to extract.</p>
|
144 |
|
@@ -214,7 +216,7 @@
|
|
214 |
<div class="p-4">
|
215 |
<div class="document-preview bg-gray-100 rounded-lg overflow-hidden relative">
|
216 |
<div class="absolute top-0 left-0 right-0 bg-gray-800 text-white py-1 px-3 text-sm flex justify-between items-center">
|
217 |
-
<span>source_document.pdf</span>
|
218 |
<div class="flex space-x-2">
|
219 |
<button class="text-gray-300 hover:text-white">
|
220 |
<i class="fas fa-search-plus"></i>
|
@@ -229,11 +231,11 @@
|
|
229 |
</div>
|
230 |
<div class="absolute bottom-0 left-0 right-0 bg-gray-100 py-2 px-4 flex justify-between items-center border-t border-gray-200">
|
231 |
<button id="prevPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
232 |
-
<i class="fas fa-chevron-left"></i> Previous
|
233 |
</button>
|
234 |
<span class="text-sm text-gray-600">Page <span id="currentPage">1</span> of <span id="totalPages">3</span></span>
|
235 |
<button id="nextPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
236 |
-
Next <i class="fas fa-chevron-right"></i>
|
237 |
</button>
|
238 |
</div>
|
239 |
</div>
|
@@ -377,7 +379,7 @@
|
|
377 |
<h4 class="font-medium text-gray-900 mb-3">Completed Document Preview</h4>
|
378 |
<div class="document-preview bg-gray-100 rounded-lg overflow-hidden relative">
|
379 |
<div class="absolute top-0 left-0 right-0 bg-gray-800 text-white py-1 px-3 text-sm flex justify-between items-center">
|
380 |
-
<span>completed_document.pdf</span>
|
381 |
<div class="flex space-x-2">
|
382 |
<button class="text-gray-300 hover:text-white">
|
383 |
<i class="fas fa-search-plus"></i>
|
@@ -392,11 +394,11 @@
|
|
392 |
</div>
|
393 |
<div class="absolute bottom-0 left-0 right-0 bg-gray-100 py-2 px-4 flex justify-between items-center border-t border-gray-200">
|
394 |
<button id="prevCompletedPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
395 |
-
<i class="fas fa-chevron-left"></i> Previous
|
396 |
</button>
|
397 |
<span class="text-sm text-gray-600">Page <span id="currentCompletedPage">1</span> of <span id="totalCompletedPages">3</span></span>
|
398 |
<button id="nextCompletedPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
399 |
-
Next <i class="fas fa-chevron-right"></i>
|
400 |
</button>
|
401 |
</div>
|
402 |
</div>
|
@@ -536,6 +538,8 @@
|
|
536 |
let totalPages = 3;
|
537 |
let currentCompletedPage = 1;
|
538 |
let totalCompletedPages = 3;
|
|
|
|
|
539 |
|
540 |
// DOM elements
|
541 |
const sourceDropzone = document.getElementById('sourceDropzone');
|
@@ -564,6 +568,12 @@
|
|
564 |
const showingStartSpan = document.getElementById('showingStart');
|
565 |
const showingEndSpan = document.getElementById('showingEnd');
|
566 |
const totalFieldsSpan = document.getElementById('totalFields');
|
|
|
|
|
|
|
|
|
|
|
|
|
567 |
|
568 |
// Initialize the app
|
569 |
function init() {
|
@@ -601,6 +611,11 @@
|
|
601 |
// Disable previous page buttons initially
|
602 |
prevPageButton.disabled = true;
|
603 |
prevCompletedPageButton.disabled = true;
|
|
|
|
|
|
|
|
|
|
|
604 |
}
|
605 |
|
606 |
// Set up drag and drop functionality
|
@@ -633,10 +648,13 @@
|
|
633 |
function handleSourceFileUpload() {
|
634 |
if (sourceFileInput.files.length) {
|
635 |
const file = sourceFileInput.files[0];
|
|
|
|
|
636 |
console.log('Source file uploaded:', file.name);
|
637 |
|
638 |
// Show processing status
|
639 |
processingStatus.classList.remove('hidden');
|
|
|
640 |
|
641 |
// Simulate processing
|
642 |
setTimeout(() => {
|
@@ -644,6 +662,9 @@
|
|
644 |
extractedDataPreview.classList.remove('hidden');
|
645 |
populateExtractedFieldsTable();
|
646 |
updateProcessSteps(2);
|
|
|
|
|
|
|
647 |
}, 3000);
|
648 |
}
|
649 |
}
|
@@ -652,6 +673,8 @@
|
|
652 |
function handleTemplateFileUpload() {
|
653 |
if (templateFileInput.files.length) {
|
654 |
const file = templateFileInput.files[0];
|
|
|
|
|
655 |
console.log('Template file uploaded:', file.name);
|
656 |
|
657 |
// Show field mapping section
|
@@ -865,8 +888,11 @@
|
|
865 |
// Edit field
|
866 |
function editField(fieldId) {
|
867 |
const field = extractedFields.find(f => f.id === fieldId);
|
868 |
-
|
869 |
-
|
|
|
|
|
|
|
870 |
}
|
871 |
|
872 |
// Delete field
|
@@ -937,9 +963,7 @@
|
|
937 |
function handleNextStep() {
|
938 |
if (currentStep === 1) {
|
939 |
// From upload source to upload template
|
940 |
-
|
941 |
-
nextButton.textContent = 'Continue';
|
942 |
-
backButton.classList.remove('hidden');
|
943 |
} else if (currentStep === 2) {
|
944 |
// From upload template to field mapping (handled in upload handler)
|
945 |
} else if (currentStep === 3) {
|
@@ -947,6 +971,7 @@
|
|
947 |
fieldMappingSection.classList.add('hidden');
|
948 |
finalOutputSection.classList.remove('hidden');
|
949 |
nextButton.textContent = 'Finish';
|
|
|
950 |
} else if (currentStep === 4) {
|
951 |
// Finish the process
|
952 |
alert('Document processing completed!');
|
@@ -959,20 +984,23 @@
|
|
959 |
if (currentStep === 2) {
|
960 |
// From upload template back to upload source
|
961 |
templateUploadSection.classList.add('hidden');
|
|
|
|
|
962 |
backButton.classList.add('hidden');
|
963 |
nextButton.textContent = 'Continue';
|
|
|
964 |
} else if (currentStep === 3) {
|
965 |
// From field mapping back to upload template
|
966 |
fieldMappingSection.classList.add('hidden');
|
967 |
templateUploadSection.classList.remove('hidden');
|
|
|
968 |
} else if (currentStep === 4) {
|
969 |
// From final output back to field mapping
|
970 |
finalOutputSection.classList.add('hidden');
|
971 |
fieldMappingSection.classList.remove('hidden');
|
972 |
nextButton.textContent = 'Continue';
|
|
|
973 |
}
|
974 |
-
|
975 |
-
updateProcessSteps(currentStep - 1);
|
976 |
}
|
977 |
|
978 |
// Update process steps UI
|
@@ -1015,6 +1043,97 @@
|
|
1015 |
} else {
|
1016 |
nextButton.textContent = 'Continue';
|
1017 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1018 |
}
|
1019 |
|
1020 |
// Initialize the app when DOM is loaded
|
|
|
7 |
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/pdf-lib.min.js"></script>
|
9 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
10 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js"></script>
|
11 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js"></script>
|
12 |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
|
13 |
<style>
|
14 |
.dropzone {
|
|
|
140 |
</div>
|
141 |
|
142 |
<!-- Upload source document -->
|
143 |
+
<div id="sourceUploadSection" class="bg-white rounded-lg shadow p-6 mb-8">
|
144 |
<h3 class="text-lg font-medium text-gray-900 mb-4">Upload Source Document</h3>
|
145 |
<p class="text-sm text-gray-500 mb-6">Upload a completed invoice, form, or PDF document that contains the data you want to extract.</p>
|
146 |
|
|
|
216 |
<div class="p-4">
|
217 |
<div class="document-preview bg-gray-100 rounded-lg overflow-hidden relative">
|
218 |
<div class="absolute top-0 left-0 right-0 bg-gray-800 text-white py-1 px-3 text-sm flex justify-between items-center">
|
219 |
+
<span id="sourceFileName">source_document.pdf</span>
|
220 |
<div class="flex space-x-2">
|
221 |
<button class="text-gray-300 hover:text-white">
|
222 |
<i class="fas fa-search-plus"></i>
|
|
|
231 |
</div>
|
232 |
<div class="absolute bottom-0 left-0 right-0 bg-gray-100 py-2 px-4 flex justify-between items-center border-t border-gray-200">
|
233 |
<button id="prevPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
234 |
+
<i class="fas fa-chevron-left mr-1"></i> Previous
|
235 |
</button>
|
236 |
<span class="text-sm text-gray-600">Page <span id="currentPage">1</span> of <span id="totalPages">3</span></span>
|
237 |
<button id="nextPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
238 |
+
Next <i class="fas fa-chevron-right ml-1"></i>
|
239 |
</button>
|
240 |
</div>
|
241 |
</div>
|
|
|
379 |
<h4 class="font-medium text-gray-900 mb-3">Completed Document Preview</h4>
|
380 |
<div class="document-preview bg-gray-100 rounded-lg overflow-hidden relative">
|
381 |
<div class="absolute top-0 left-0 right-0 bg-gray-800 text-white py-1 px-3 text-sm flex justify-between items-center">
|
382 |
+
<span id="completedDocumentName">completed_document.pdf</span>
|
383 |
<div class="flex space-x-2">
|
384 |
<button class="text-gray-300 hover:text-white">
|
385 |
<i class="fas fa-search-plus"></i>
|
|
|
394 |
</div>
|
395 |
<div class="absolute bottom-0 left-0 right-0 bg-gray-100 py-2 px-4 flex justify-between items-center border-t border-gray-200">
|
396 |
<button id="prevCompletedPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
397 |
+
<i class="fas fa-chevron-left mr-1"></i> Previous
|
398 |
</button>
|
399 |
<span class="text-sm text-gray-600">Page <span id="currentCompletedPage">1</span> of <span id="totalCompletedPages">3</span></span>
|
400 |
<button id="nextCompletedPage" class="text-gray-600 hover:text-blue-600 disabled:text-gray-300">
|
401 |
+
Next <i class="fas fa-chevron-right ml-1"></i>
|
402 |
</button>
|
403 |
</div>
|
404 |
</div>
|
|
|
538 |
let totalPages = 3;
|
539 |
let currentCompletedPage = 1;
|
540 |
let totalCompletedPages = 3;
|
541 |
+
let sourceFileName = '';
|
542 |
+
let templateFileName = '';
|
543 |
|
544 |
// DOM elements
|
545 |
const sourceDropzone = document.getElementById('sourceDropzone');
|
|
|
568 |
const showingStartSpan = document.getElementById('showingStart');
|
569 |
const showingEndSpan = document.getElementById('showingEnd');
|
570 |
const totalFieldsSpan = document.getElementById('totalFields');
|
571 |
+
const sourceFileNameSpan = document.getElementById('sourceFileName');
|
572 |
+
const completedDocumentNameSpan = document.getElementById('completedDocumentName');
|
573 |
+
const sourceUploadSection = document.getElementById('sourceUploadSection');
|
574 |
+
const downloadPdfButton = document.getElementById('downloadPdf');
|
575 |
+
const downloadJsonButton = document.getElementById('downloadJson');
|
576 |
+
const downloadCsvButton = document.getElementById('downloadCsv');
|
577 |
|
578 |
// Initialize the app
|
579 |
function init() {
|
|
|
611 |
// Disable previous page buttons initially
|
612 |
prevPageButton.disabled = true;
|
613 |
prevCompletedPageButton.disabled = true;
|
614 |
+
|
615 |
+
// Set up export buttons
|
616 |
+
downloadPdfButton.addEventListener('click', exportPdf);
|
617 |
+
downloadJsonButton.addEventListener('click', exportJson);
|
618 |
+
downloadCsvButton.addEventListener('click', exportCsv);
|
619 |
}
|
620 |
|
621 |
// Set up drag and drop functionality
|
|
|
648 |
function handleSourceFileUpload() {
|
649 |
if (sourceFileInput.files.length) {
|
650 |
const file = sourceFileInput.files[0];
|
651 |
+
sourceFileName = file.name;
|
652 |
+
sourceFileNameSpan.textContent = sourceFileName;
|
653 |
console.log('Source file uploaded:', file.name);
|
654 |
|
655 |
// Show processing status
|
656 |
processingStatus.classList.remove('hidden');
|
657 |
+
sourceUploadSection.classList.add('hidden');
|
658 |
|
659 |
// Simulate processing
|
660 |
setTimeout(() => {
|
|
|
662 |
extractedDataPreview.classList.remove('hidden');
|
663 |
populateExtractedFieldsTable();
|
664 |
updateProcessSteps(2);
|
665 |
+
|
666 |
+
// Enable template upload section
|
667 |
+
templateUploadSection.classList.remove('hidden');
|
668 |
}, 3000);
|
669 |
}
|
670 |
}
|
|
|
673 |
function handleTemplateFileUpload() {
|
674 |
if (templateFileInput.files.length) {
|
675 |
const file = templateFileInput.files[0];
|
676 |
+
templateFileName = file.name;
|
677 |
+
completedDocumentNameSpan.textContent = `filled_${templateFileName}`;
|
678 |
console.log('Template file uploaded:', file.name);
|
679 |
|
680 |
// Show field mapping section
|
|
|
888 |
// Edit field
|
889 |
/**
 * Let the user edit the value of one extracted field.
 *
 * Looks the field up in `extractedFields` by id, shows a `prompt()` dialog
 * pre-filled with the current value and, unless the dialog is cancelled,
 * stores the entered text and re-renders the table through
 * `populateExtractedFieldsTable()`.
 *
 * @param {*} fieldId - Id compared with `===` against each field's `id`.
 * @returns {void}
 */
function editField(fieldId) {
  const field = extractedFields.find(f => f.id === fieldId);

  // `find` yields undefined for an unknown id; without this guard the
  // template literal below would throw a TypeError on `field.name`.
  if (!field) {
    console.warn('editField: no extracted field with id', fieldId);
    return;
  }

  const newValue = prompt(`Edit field: ${field.name}\nCurrent value: ${field.value}`, field.value);

  // prompt() returns null when the dialog is cancelled; an empty string is
  // a legitimate new value and is kept.
  if (newValue !== null) {
    field.value = newValue;
    populateExtractedFieldsTable();
  }
}
|
897 |
|
898 |
// Delete field
|
|
|
963 |
function handleNextStep() {
|
964 |
if (currentStep === 1) {
|
965 |
// From upload source to upload template
|
966 |
+
// This is now handled automatically after source file processing
|
|
|
|
|
967 |
} else if (currentStep === 2) {
|
968 |
// From upload template to field mapping (handled in upload handler)
|
969 |
} else if (currentStep === 3) {
|
|
|
971 |
fieldMappingSection.classList.add('hidden');
|
972 |
finalOutputSection.classList.remove('hidden');
|
973 |
nextButton.textContent = 'Finish';
|
974 |
+
updateProcessSteps(4);
|
975 |
} else if (currentStep === 4) {
|
976 |
// Finish the process
|
977 |
alert('Document processing completed!');
|
|
|
984 |
if (currentStep === 2) {
|
985 |
// From upload template back to upload source
|
986 |
templateUploadSection.classList.add('hidden');
|
987 |
+
extractedDataPreview.classList.add('hidden');
|
988 |
+
sourceUploadSection.classList.remove('hidden');
|
989 |
backButton.classList.add('hidden');
|
990 |
nextButton.textContent = 'Continue';
|
991 |
+
updateProcessSteps(1);
|
992 |
} else if (currentStep === 3) {
|
993 |
// From field mapping back to upload template
|
994 |
fieldMappingSection.classList.add('hidden');
|
995 |
templateUploadSection.classList.remove('hidden');
|
996 |
+
updateProcessSteps(2);
|
997 |
} else if (currentStep === 4) {
|
998 |
// From final output back to field mapping
|
999 |
finalOutputSection.classList.add('hidden');
|
1000 |
fieldMappingSection.classList.remove('hidden');
|
1001 |
nextButton.textContent = 'Continue';
|
1002 |
+
updateProcessSteps(3);
|
1003 |
}
|
|
|
|
|
1004 |
}
|
1005 |
|
1006 |
// Update process steps UI
|
|
|
1043 |
} else {
|
1044 |
nextButton.textContent = 'Continue';
|
1045 |
}
|
1046 |
+
|
1047 |
+
// Update back button visibility
|
1048 |
+
if (step > 1) {
|
1049 |
+
backButton.classList.remove('hidden');
|
1050 |
+
} else {
|
1051 |
+
backButton.classList.add('hidden');
|
1052 |
+
}
|
1053 |
+
}
|
1054 |
+
|
1055 |
+
// Export PDF
|
1056 |
+
/**
 * Generate a simple PDF listing every mapped template field and download it.
 *
 * This is a demo export: it does not fill the uploaded template. It writes a
 * centred title followed by one "name: value" line per template field whose
 * `mappedTo` id resolves to an entry in `extractedFields`, then saves the
 * document via jsPDF and shows a confirmation alert.
 *
 * Reads the globals `templateFields`, `extractedFields`, `templateFileName`
 * and `window.jspdf` (the jsPDF UMD bundle).
 *
 * @returns {void}
 */
function exportPdf() {
  const { jsPDF } = window.jspdf;
  const doc = new jsPDF();

  // Title
  doc.setFontSize(20);
  doc.text('Completed Document', 105, 20, { align: 'center' });

  // Field rows
  doc.setFontSize(12);
  const pageMargin = 20;
  const lineHeight = 10;
  // Lowest y at which a row may still be drawn before a new page is needed.
  const bottomLimit = doc.internal.pageSize.getHeight() - pageMargin;
  let y = 40;

  templateFields.forEach(templateField => {
    if (!templateField.mappedTo) {
      return;
    }
    const sourceField = extractedFields.find(f => f.id === templateField.mappedTo);
    // A mapping can point at a source field that no longer exists (e.g. it
    // was deleted after mapping); skip it instead of throwing.
    if (!sourceField) {
      return;
    }
    // Continue on a fresh page rather than drawing past the bottom edge.
    if (y > bottomLimit) {
      doc.addPage();
      y = pageMargin;
    }
    doc.text(`${templateField.name}: ${sourceField.value}`, 20, y);
    y += lineHeight;
  });

  // Drop an existing ".pdf" suffix so "form.pdf" becomes "filled_form.pdf"
  // and not "filled_form.pdf.pdf".
  const baseName = (templateFileName || 'document').replace(/\.pdf$/i, '');
  doc.save(`filled_${baseName}.pdf`);

  alert('PDF document generated successfully!');
}
|
1093 |
+
|
1094 |
+
// Export JSON
|
1095 |
+
/**
 * Download the mapped field data as a pretty-printed JSON file.
 *
 * Builds an object keyed by template field name. Each entry holds the mapped
 * source value, the template field's `confidence`, and the name of the source
 * field it came from. Template fields without a mapping, or whose mapping
 * does not resolve to an entry in `extractedFields`, are left out.
 *
 * Reads the globals `templateFields` and `extractedFields`, and downloads via
 * the global `saveAs` (FileSaver.js) under the name
 * `document_data_<YYYY-MM-DD>.json`, then shows a confirmation alert.
 *
 * @returns {void}
 */
function exportJson() {
  const mappedData = {};

  templateFields.forEach(templateField => {
    if (!templateField.mappedTo) {
      return;
    }
    const sourceField = extractedFields.find(f => f.id === templateField.mappedTo);
    // A mapping may reference a source field that no longer exists; skip it
    // rather than throwing on `sourceField.value`.
    if (!sourceField) {
      return;
    }
    mappedData[templateField.name] = {
      value: sourceField.value,
      confidence: templateField.confidence,
      sourceField: sourceField.name
    };
  });

  const jsonStr = JSON.stringify(mappedData, null, 2);

  // Date stamp is the first 10 characters of the ISO timestamp (YYYY-MM-DD).
  const dateStamp = new Date().toISOString().slice(0, 10);
  const blob = new Blob([jsonStr], { type: 'application/json' });
  saveAs(blob, `document_data_${dateStamp}.json`);

  alert('JSON data exported successfully!');
}
|
1118 |
+
|
1119 |
+
// Export CSV
|
1120 |
+
/**
 * Download the mapped field data as a CSV file.
 *
 * Emits a header row followed by one row per template field whose `mappedTo`
 * id resolves to an entry in `extractedFields`. Text cells are wrapped in
 * double quotes with embedded quotes doubled; the confidence column is
 * written unquoted, as before.
 *
 * Reads the globals `templateFields` and `extractedFields`, and downloads via
 * the global `saveAs` (FileSaver.js) under the name
 * `document_data_<YYYY-MM-DD>.csv`, then shows a confirmation alert.
 *
 * @returns {void}
 */
function exportCsv() {
  // Quote a cell and double any embedded `"` so that values containing
  // quotes, commas or line breaks stay inside a single CSV field.
  const quoteCsvCell = value => `"${String(value).replace(/"/g, '""')}"`;

  let csv = 'Field Name,Value,Confidence,Source Field\n';

  templateFields.forEach(templateField => {
    if (!templateField.mappedTo) {
      return;
    }
    const sourceField = extractedFields.find(f => f.id === templateField.mappedTo);
    // A mapping may reference a source field that no longer exists; skip it
    // rather than throwing on `sourceField.value`.
    if (!sourceField) {
      return;
    }
    const cells = [
      quoteCsvCell(templateField.name),
      quoteCsvCell(sourceField.value),
      templateField.confidence,
      quoteCsvCell(sourceField.name)
    ];
    csv += `${cells.join(',')}\n`;
  });

  // Date stamp is the first 10 characters of the ISO timestamp (YYYY-MM-DD).
  const dateStamp = new Date().toISOString().slice(0, 10);
  const blob = new Blob([csv], { type: 'text/csv;charset=utf-8;' });
  saveAs(blob, `document_data_${dateStamp}.csv`);

  alert('CSV data exported successfully!');
}
|
1138 |
|
1139 |
// Initialize the app when DOM is loaded
|
prompts.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
Design and implement a full-stack intelligent document automation system functionally analogous to the application available at git clone https://huggingface.co/spaces/seamoors/wealthsync-ad. The application shall enable users to upload a fully completed invoice, form, or PDF document, which will then be programmatically parsed using a suite of Python-based OCR (e.g., Tesseract), PDF parsing (e.g., PyMuPDF, pdfplumber, or PyPDF2), and machine learning techniques for semantic field recognition and structured data extraction. The core functionality must support: Field Extraction & Analysis: Upon uploading a completed form, the application must utilize OCR, visual layout analysis, and NLP-driven entity recognition to extract both labeled and inferred field data with high accuracy. All extracted values should be normalized into a unified schema for downstream processing. Template-Agnostic Form Filling: The user will subsequently upload a second PDF — a blank or partially completed template form that may differ in structure or layout from the original. The system must intelligently map and propagate values from the original form to this new document, using semantic similarity (e.g., cosine similarity on embedded field names, BERT-based field name matching, or fuzzy logic heuristics). It must robustly handle field remapping, layout discrepancies, and varying form structures. Output Formats & Data Export: Render a real-time, user-visible preview of the completed PDF document within the UI. Enable export in three formats: A downloadable filled-in PDF document. A structured and semantically accurate JSON representation of the data. A CSV file conforming to normalized tabular output specifications. Architecture & Stack Requirements: Backend: Python with FastAPI or Flask. Frontend: React or Streamlit (if rapid prototyping is preferred). 
ML/AI: Integrate document layout models (e.g., LayoutLMv3), OCR engines (e.g., Tesseract or EasyOCR), and optional fine-tuned transformer models for field matching. Data Persistence: Optional use of a document database (e.g., MongoDB) for session persistence or audit logs. Include robust error handling, field confidence scoring, and preview customization. Deliverables: Fully functional application with modular, maintainable code. Inline documentation for all components. README with installation, usage, and architecture overview. Exportable build (e.g., Dockerized container or deployment instructions). Constraints: Must support varying form formats without relying on static template matching. Ensure data privacy and sandboxed file handling. Prioritize high field fidelity, semantic consistency, and UI responsiveness.
|
|
|
|
1 |
+
Design and implement a full-stack intelligent document automation system functionally analogous to the application available at git clone https://huggingface.co/spaces/seamoors/wealthsync-ad. The application shall enable users to upload a fully completed invoice, form, or PDF document, which will then be programmatically parsed using a suite of Python-based OCR (e.g., Tesseract), PDF parsing (e.g., PyMuPDF, pdfplumber, or PyPDF2), and machine learning techniques for semantic field recognition and structured data extraction. The core functionality must support: Field Extraction & Analysis: Upon uploading a completed form, the application must utilize OCR, visual layout analysis, and NLP-driven entity recognition to extract both labeled and inferred field data with high accuracy. All extracted values should be normalized into a unified schema for downstream processing. Template-Agnostic Form Filling: The user will subsequently upload a second PDF — a blank or partially completed template form that may differ in structure or layout from the original. The system must intelligently map and propagate values from the original form to this new document, using semantic similarity (e.g., cosine similarity on embedded field names, BERT-based field name matching, or fuzzy logic heuristics). It must robustly handle field remapping, layout discrepancies, and varying form structures. Output Formats & Data Export: Render a real-time, user-visible preview of the completed PDF document within the UI. Enable export in three formats: A downloadable filled-in PDF document. A structured and semantically accurate JSON representation of the data. A CSV file conforming to normalized tabular output specifications. Architecture & Stack Requirements: Backend: Python with FastAPI or Flask. Frontend: React or Streamlit (if rapid prototyping is preferred). 
ML/AI: Integrate document layout models (e.g., LayoutLMv3), OCR engines (e.g., Tesseract or EasyOCR), and optional fine-tuned transformer models for field matching. Data Persistence: Optional use of a document database (e.g., MongoDB) for session persistence or audit logs. Include robust error handling, field confidence scoring, and preview customization. Deliverables: Fully functional application with modular, maintainable code. Inline documentation for all components. README with installation, usage, and architecture overview. Exportable build (e.g., Dockerized container or deployment instructions). Constraints: Must support varying form formats without relying on static template matching. Ensure data privacy and sandboxed file handling. Prioritize high field fidelity, semantic consistency, and UI responsiveness.
|
2 |
+
That almost works, with two exceptions: the upload-template step (step 2) does not activate after a source document is uploaded, and the final output does not export the PDF, JSON, or CSV files with the auto-filled fields that I require, nor does it download them to the user's computer.
|