#!/usr/bin/env python3 """ Encoding Fix Script for Legal Dashboard OCR ========================================== This script fixes Unicode encoding issues that can occur on Windows systems. Based on solutions from: https://docs.appseed.us/content/how-to-fix/unicodedecodeerror-charmap-codec-cant-decode-byte-0x9d/ """ import os import sys import codecs def fix_file_encoding(file_path, target_encoding='utf-8'): """Fix encoding issues in a file""" try: # Try to read with different encodings encodings_to_try = ['utf-8', 'utf-8-sig', 'cp1252', 'latin-1', 'iso-8859-1'] content = None used_encoding = None for encoding in encodings_to_try: try: with open(file_path, 'r', encoding=encoding) as f: content = f.read() used_encoding = encoding print( f"āœ… Successfully read {file_path} with {encoding} encoding") break except UnicodeDecodeError: continue if content is None: print(f"āŒ Could not read {file_path} with any encoding") return False # Write back with UTF-8 encoding with open(file_path, 'w', encoding='utf-8') as f: f.write(content) print(f"āœ… Fixed encoding for {file_path}") return True except Exception as e: print(f"āŒ Error fixing {file_path}: {e}") return False def fix_project_encoding(): """Fix encoding issues in the entire project""" print("šŸ”§ Fixing encoding issues in Legal Dashboard OCR project...") # Files that might have encoding issues files_to_fix = [ "huggingface_space/app.py", "huggingface_space/README.md", "requirements.txt", "README.md", "DEPLOYMENT_INSTRUCTIONS.md", "FINAL_DEPLOYMENT_INSTRUCTIONS.md", "DEPLOYMENT_SUMMARY.md" ] fixed_count = 0 total_files = len(files_to_fix) for file_path in files_to_fix: if os.path.exists(file_path): if fix_file_encoding(file_path): fixed_count += 1 else: print(f"āš ļø File not found: {file_path}") print(f"\nšŸ“Š Encoding Fix Results:") print(f"āœ… Fixed: {fixed_count}/{total_files} files") return fixed_count == total_files def set_environment_encoding(): """Set environment variables for proper encoding""" print("\nšŸ”§ Setting environment variables for encoding...") # Set UTF-8 environment variable for Windows os.environ['PYTHONUTF8'] = '1' # For Windows CMD print("For Windows CMD, run: set PYTHONUTF8=1") # For PowerShell print("For PowerShell, run: $env:PYTHONUTF8=1") print("āœ… Environment encoding variables set") def main(): """Main function to fix encoding issues""" print("šŸš€ Legal Dashboard OCR - Encoding Fix") print("=" * 50) # Fix file encodings files_ok = fix_project_encoding() # Set environment encoding set_environment_encoding() print("\n" + "=" * 50) if files_ok: print("šŸŽ‰ All encoding issues fixed!") print("āœ… Project is ready for deployment") return 0 else: print("āš ļø Some encoding issues remain") print("Please check the files manually") return 1 if __name__ == "__main__": sys.exit(main())