File size: 6,183 Bytes
922c3ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python3
"""

Security Check Script for Legal Dashboard OCR

============================================



This script checks for hardcoded secrets, tokens, and API keys in the codebase.

Based on security best practices from GitGuardian and Hugging Face documentation.

"""

import os
import re
import sys
from pathlib import Path


def check_for_hardcoded_secrets():
    """Scan a fixed list of project files for strings that look like secrets.

    Every path in the hard-coded file list is resolved relative to the
    current working directory and its text is matched against a set of
    regular expressions for common credential formats. Paths that do not
    exist are skipped silently; paths that exist but cannot be read produce
    a warning on stdout and are skipped.

    Returns:
        A list of dicts, one per (file, pattern) pair that produced at least
        one match, with the keys ``'file'`` (the path as listed),
        ``'pattern'`` (the regex source) and ``'matches'`` (the list returned
        by ``re.findall``). The list is empty when nothing matched.
    """
    print("πŸ”’ Security Check - Looking for hardcoded secrets...")

    # Patterns to look for.
    # NOTE: the "AWS secret keys" pattern matches any run of 40 characters
    # from [0-9a-zA-Z/+], so it can also fire on hashes or base64 blobs.
    secret_patterns = [
        r'hf_[a-zA-Z0-9]{20,}',  # Hugging Face tokens
        r'sk-[a-zA-Z0-9]{20,}',  # OpenAI API keys
        r'pk_[a-zA-Z0-9]{20,}',  # Stripe public keys
        r'sk_[a-zA-Z0-9]{20,}',  # Stripe secret keys
        r'AKIA[0-9A-Z]{16}',     # AWS access keys
        r'[0-9a-zA-Z/+]{40}',    # AWS secret keys
        r'ghp_[a-zA-Z0-9]{36}',  # GitHub personal access tokens
        r'gho_[a-zA-Z0-9]{36}',  # GitHub OAuth tokens
        r'ghu_[a-zA-Z0-9]{36}',  # GitHub user-to-server tokens
        r'ghs_[a-zA-Z0-9]{36}',  # GitHub server-to-server tokens
        r'ghr_[a-zA-Z0-9]{36}',  # GitHub refresh tokens
    ]

    # Files to check
    files_to_check = [
        "app/services/ocr_service.py",
        "app/services/ai_service.py",
        "app/services/database_service.py",
        "app/main.py",
        "huggingface_space/app.py",
        "requirements.txt",
        "README.md"
    ]

    found_secrets = []

    for file_path in files_to_check:
        if not os.path.exists(file_path):
            continue

        try:
            # errors='replace' keeps a file scannable even when it contains
            # stray non-UTF-8 bytes; a strict decode would abort the read and
            # the file would silently escape the secret scan.
            with open(file_path, 'r', encoding='utf-8',
                      errors='replace') as f:
                content = f.read()
        except OSError as e:
            print(f"⚠️ Error reading {file_path}: {e}")
            continue

        for pattern in secret_patterns:
            matches = re.findall(pattern, content)
            if matches:
                found_secrets.append({
                    'file': file_path,
                    'pattern': pattern,
                    'matches': matches
                })

    return found_secrets


def check_environment_variables():
    """Report which of the expected environment variables are set.

    For each of HF_TOKEN, OPENAI_API_KEY, DATABASE_URL and SECRET_KEY a line
    is printed saying whether the variable has a non-empty value. A missing
    variable is only a warning: the function never fails the check.

    Returns:
        Always ``True``.
    """
    print("\nπŸ” Checking environment variable usage...")

    for var in ("HF_TOKEN", "OPENAI_API_KEY", "DATABASE_URL", "SECRET_KEY"):
        # Unset and empty values are treated alike, as with os.getenv() truthiness.
        if not os.getenv(var):
            print(
                f"⚠️ {var} not found in environment (this is OK for development)")
            continue
        print(f"βœ… {var} is set in environment")

    # Absence of a variable is informational only, so the result is fixed.
    return True


def check_gitignore():
    """Check that ``.gitignore`` contains rules for sensitive files.

    Reads ``.gitignore`` from the current working directory (a missing file
    is treated as having no rules) and verifies that each expected entry
    (``.env``, ``*.key``, ``*.pem``, ``secrets.json``, ``config.json``)
    appears inside at least one active rule line. Blank lines, comment lines
    (starting with ``#``) and negation rules (starting with ``!``) are not
    counted, because they do not cause anything to be ignored.

    Returns:
        ``True`` when every expected entry is covered, ``False`` otherwise.
        The outcome is also printed to stdout.
    """
    print("\nπŸ“ Checking .gitignore for sensitive files...")

    sensitive_files = [
        ".env",
        "*.key",
        "*.pem",
        "secrets.json",
        "config.json"
    ]

    active_rules = []
    if os.path.exists(".gitignore"):
        # Explicit UTF-8 with replacement: the result must not depend on the
        # platform default encoding, and odd bytes must not abort the check.
        with open(".gitignore", 'r', encoding='utf-8',
                  errors='replace') as f:
            for raw_line in f:
                rule = raw_line.strip()
                # A commented-out or negated rule does not ignore the file.
                if rule and not rule.startswith(('#', '!')):
                    active_rules.append(rule)

    # Substring match within a rule line, so entries such as ".env*" or
    # "/config.json" still count as covering the expected pattern.
    missing_entries = [
        file_pattern
        for file_pattern in sensitive_files
        if not any(file_pattern in rule for rule in active_rules)
    ]

    if missing_entries:
        print(f"⚠️ Missing from .gitignore: {missing_entries}")
        return False

    print("βœ… .gitignore properly configured")
    return True


def generate_security_report(found_secrets):
    """Print a summary of the secret scan and return whether it was clean.

    Args:
        found_secrets: Iterable of dicts with the keys ``'file'``,
            ``'pattern'`` and ``'matches'``; only the number of matches is
            printed, not the matched text itself.

    Returns:
        ``True`` when ``found_secrets`` is empty (or otherwise falsy),
        ``False`` when at least one finding is present.
    """
    print("\nπŸ“Š Security Check Report")
    print("=" * 50)

    # Clean result first, so the findings listing is not nested in a branch.
    if not found_secrets:
        print("βœ… No hardcoded secrets found!")
        return True

    print("❌ HARDCODED SECRETS FOUND:")
    for secret in found_secrets:
        print(f"  File: {secret['file']}")
        print(f"  Pattern: {secret['pattern']}")
        print(f"  Matches: {len(secret['matches'])} found")
        print("  ---")
    return False


def provide_remediation_advice():
    """Print a fixed list of remediation tips to stdout.

    The advice is static text grouped under four numbered headings; nothing
    is returned and no state is read or modified.
    """
    print("\nπŸ”§ Security Remediation Advice")
    print("=" * 40)

    # One entry per output line; section headings after the first carry a
    # leading newline so each group is separated by a blank line.
    advice_lines = (
        "1. **Remove Hardcoded Tokens**:",
        "   - Replace hardcoded tokens with environment variables",
        "   - Use os.getenv() to read from environment",
        "   - Set tokens in Hugging Face Space settings",
        "\n2. **Environment Variables**:",
        "   - Set HF_TOKEN in your Space settings",
        "   - Use .env files for local development",
        "   - Never commit .env files to version control",
        "\n3. **Git Security**:",
        "   - Add sensitive files to .gitignore",
        "   - Use git-secrets for pre-commit hooks",
        "   - Regularly audit your repository",
        "\n4. **Hugging Face Best Practices**:",
        "   - Use Space secrets for sensitive data",
        "   - Keep tokens private and rotate regularly",
        "   - Monitor token usage and permissions",
    )
    for advice in advice_lines:
        print(advice)


def main():
    """Run every security check in order and summarise the outcome.

    The secret scan, the environment-variable check and the .gitignore check
    run first, followed by the report for the secret scan. Remediation advice
    is printed when any check fails.

    Returns:
        ``0`` when all checks passed, ``1`` otherwise (suitable as a process
        exit status).
    """
    print("πŸ”’ Legal Dashboard OCR - Security Check")
    print("=" * 50)

    # The call order determines the order of the printed sections.
    found_secrets = check_for_hardcoded_secrets()
    env_ok = check_environment_variables()
    gitignore_ok = check_gitignore()
    secrets_ok = generate_security_report(found_secrets)

    # Final result
    print("\n" + "=" * 50)
    if not (secrets_ok and env_ok and gitignore_ok):
        print("⚠️ Security issues found!")
        provide_remediation_advice()
        return 1

    for success_line in (
        "πŸŽ‰ Security check passed!",
        "βœ… No hardcoded secrets found",
        "βœ… Environment variables properly configured",
        "βœ… Git security measures in place",
    ):
        print(success_line)
    return 0


# Run the checks only when executed as a script; the value returned by
# main() (0 or 1) is passed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())