sandylolpotty committed
Commit
7d2e5d0
·
verified ·
1 Parent(s): 1f06f2c

Create app.py

Files changed (1)
  1. app.py +1164 -0
app.py ADDED
@@ -0,0 +1,1164 @@
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import json
5
+ import tempfile
6
+ import hashlib
7
+ from pathlib import Path
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional, Union
10
+ import logging
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Optional imports for document processing
17
+ try:
18
+ from docx import Document
19
+ DOCX_AVAILABLE = True
20
+ except ImportError:
21
+ DOCX_AVAILABLE = False
22
+ logger.warning("python-docx not installed. DOCX processing will be disabled.")
23
+
24
+ try:
25
+ import PyPDF2
26
+ PDF_AVAILABLE = True
27
+ except ImportError:
28
+ PDF_AVAILABLE = False
29
+ logger.warning("PyPDF2 not installed. PDF processing will be disabled.")
30
+
31
+ try:
32
+ import fitz # PyMuPDF - alternative PDF processor
33
+ PYMUPDF_AVAILABLE = True
34
+ except ImportError:
35
+ PYMUPDF_AVAILABLE = False
36
+
37
+ # Optional imports for advanced text processing
38
+ try:
39
+ import nltk
40
+ from nltk.tokenize import sent_tokenize, word_tokenize
41
+ from nltk.corpus import stopwords
42
+ from nltk.probability import FreqDist
43
+ from nltk.sentiment import SentimentIntensityAnalyzer
44
+ NLTK_AVAILABLE = True
45
+ # Download required NLTK data
46
+ required_nltk_data = ['punkt', 'stopwords', 'vader_lexicon']
47
+ for data_name in required_nltk_data:
48
+ try:
49
+ if data_name == 'punkt':
50
+ nltk.data.find('tokenizers/punkt')
51
+ elif data_name == 'stopwords':
52
+ nltk.data.find('corpora/stopwords')
53
+ elif data_name == 'vader_lexicon':
54
+ nltk.data.find('sentiment/vader_lexicon.zip')
55
+ except LookupError:
56
+ nltk.download(data_name, quiet=True)
57
+ except ImportError:
58
+ NLTK_AVAILABLE = False
59
+ logger.warning("NLTK not installed. Advanced text analysis will be limited.")
60
+
61
+ try:
62
+ from transformers import pipeline
63
+ import torch
64
+ TRANSFORMERS_AVAILABLE = True
65
+ DEVICE = 0 if torch.cuda.is_available() else -1
66
+ except ImportError:
67
+ TRANSFORMERS_AVAILABLE = False
68
+ DEVICE = -1
69
+ logger.warning("transformers not installed. AI summarization will use basic extraction methods.")
70
+
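+ # Optional dependencies, matching the install note shown in the UI footer:
+ #   pip install gradio python-docx PyPDF2 transformers torch nltk PyMuPDF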
71
+ class AdvancedDocumentSummarizer:
72
+ """CatalystGPT-4 Advanced Document Summarizer with enhanced features"""
73
+
74
+ def __init__(self):
75
+ self.summarizer = None
76
+ self.sentiment_analyzer = None
77
+ self.cache = {}
78
+
79
+ # Initialize AI models
80
+ if TRANSFORMERS_AVAILABLE:
81
+ self._initialize_ai_models()
82
+
83
+ # Initialize sentiment analyzer
84
+ if NLTK_AVAILABLE:
85
+ try:
86
+ self.sentiment_analyzer = SentimentIntensityAnalyzer()
87
+ except Exception as e:
88
+ logger.warning(f"Failed to initialize sentiment analyzer: {e}")
89
+
90
+ def _initialize_ai_models(self):
91
+ """Initialize AI models with error handling and fallbacks"""
92
+ models_to_try = [
93
+ "facebook/bart-large-cnn",
94
+ "t5-small",
95
+ "google/pegasus-xsum"
96
+ ]
97
+
98
+ for model_name in models_to_try:
99
+ try:
100
+ self.summarizer = pipeline(
101
+ "summarization",
102
+ model=model_name,
103
+ device=DEVICE,
104
+ torch_dtype=torch.float16 if DEVICE >= 0 else torch.float32
105
+ )
106
+ logger.info(f"Successfully loaded {model_name}")
107
+ break
108
+ except Exception as e:
109
+ logger.warning(f"Failed to load {model_name}: {e}")
110
+ continue
111
+
112
+ def _get_file_hash(self, file_path: str) -> str:
113
+ """Generate hash for file caching"""
114
+ try:
115
+ with open(file_path, 'rb') as f:
116
+ content = f.read()
117
+ return hashlib.md5(content).hexdigest()
118
+ except Exception:
119
+ return str(datetime.now().timestamp())
120
+
121
+ def extract_text_from_pdf(self, file_path: str) -> str:
122
+ """Enhanced PDF text extraction with better error handling"""
123
+ text = ""
124
+
125
+ # Try PyMuPDF first (generally better)
126
+ if PYMUPDF_AVAILABLE:
127
+ try:
128
+ doc = fitz.open(file_path)
129
+ for page_num, page in enumerate(doc):
130
+ page_text = page.get_text()
131
+ if page_text.strip(): # Only add non-empty pages
132
+ text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
133
+ doc.close()
134
+
135
+ if text.strip():
136
+ return text
137
+ except Exception as e:
138
+ logger.error(f"PyMuPDF extraction failed: {e}")
139
+
140
+ # Fallback to PyPDF2
141
+ if PDF_AVAILABLE:
142
+ try:
143
+ with open(file_path, 'rb') as file:
144
+ pdf_reader = PyPDF2.PdfReader(file)
145
+ for page_num, page in enumerate(pdf_reader.pages):
146
+ page_text = page.extract_text()
147
+ if page_text.strip():
148
+ text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
149
+
150
+ if text.strip():
151
+ return text
152
+ except Exception as e:
153
+ logger.error(f"PyPDF2 extraction failed: {e}")
154
+
155
+ return "PDF processing libraries not available or extraction failed."
156
+
157
+ def extract_text_from_docx(self, file_path: str) -> str:
158
+ """Enhanced DOCX extraction with better formatting preservation"""
159
+ if not DOCX_AVAILABLE:
160
+ return "python-docx library not available."
161
+
162
+ try:
163
+ doc = Document(file_path)
164
+ text_parts = []
165
+
166
+ # Extract paragraphs
167
+ for paragraph in doc.paragraphs:
168
+ if paragraph.text.strip():
169
+ text_parts.append(paragraph.text)
170
+
171
+ # Extract tables
172
+ for table_num, table in enumerate(doc.tables):
173
+ text_parts.append(f"\n--- Table {table_num + 1} ---")
174
+ for row in table.rows:
175
+ row_text = " | ".join(cell.text.strip() for cell in row.cells)
176
+ if row_text.strip():
177
+ text_parts.append(row_text)
178
+
179
+ return "\n".join(text_parts)
180
+ except Exception as e:
181
+ logger.error(f"Error processing DOCX file: {e}")
182
+ return f"Error processing DOCX file: {str(e)}"
183
+
184
+ def get_enhanced_document_stats(self, text: str) -> Dict:
185
+ """Get comprehensive document statistics with sentiment analysis"""
186
+ if not text.strip():
187
+ return {}
188
+
189
+ # Basic stats
190
+ word_count = len(text.split())
191
+ char_count = len(text)
192
+ char_count_no_spaces = len(text.replace(' ', ''))
193
+ paragraph_count = len([p for p in text.split('\n\n') if p.strip()])
194
+
195
+ stats = {
196
+ 'word_count': word_count,
197
+ 'character_count': char_count,
198
+ 'character_count_no_spaces': char_count_no_spaces,
199
+ 'paragraph_count': paragraph_count,
200
+ 'estimated_reading_time': max(1, round(word_count / 200)), # 200 WPM average
201
+ 'estimated_speaking_time': max(1, round(word_count / 150)) # 150 WPM speaking
202
+ }
203
+
204
+ if NLTK_AVAILABLE:
205
+ sentences = sent_tokenize(text)
206
+ stats['sentence_count'] = len(sentences)
207
+ stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0
208
+
209
+ # Word frequency analysis
210
+ words = word_tokenize(text.lower())
211
+ stop_words = set(stopwords.words('english'))
212
+ filtered_words = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 2]
213
+
214
+ if filtered_words:
215
+ freq_dist = FreqDist(filtered_words)
216
+ stats['top_words'] = freq_dist.most_common(15)
217
+ stats['unique_words'] = len(set(filtered_words))
218
+ stats['lexical_diversity'] = round(len(set(filtered_words)) / len(filtered_words), 3) if filtered_words else 0
219
+
220
+ # Sentiment analysis
221
+ if self.sentiment_analyzer:
222
+ try:
223
+ sentiment_scores = self.sentiment_analyzer.polarity_scores(text[:5000]) # Limit for performance
224
+ stats['sentiment'] = {
225
+ 'compound': round(sentiment_scores['compound'], 3),
226
+ 'positive': round(sentiment_scores['pos'], 3),
227
+ 'negative': round(sentiment_scores['neg'], 3),
228
+ 'neutral': round(sentiment_scores['neu'], 3)
229
+ }
230
+ except Exception as e:
231
+ logger.error(f"Sentiment analysis failed: {e}")
232
+ else:
233
+ # Fallback without NLTK
234
+ sentences = [s.strip() for s in text.split('.') if s.strip()]
235
+ stats['sentence_count'] = len(sentences)
236
+ stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0
237
+
238
+ words = re.findall(r'\b\w+\b', text.lower())
239
+ word_freq = {}
240
+ for word in words:
241
+ if len(word) > 2:
242
+ word_freq[word] = word_freq.get(word, 0) + 1
243
+
244
+ stats['top_words'] = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15]
245
+ stats['unique_words'] = len(set(words))
246
+
247
+ return stats
248
+
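+ # Illustrative shape of the returned dict (values below are made up, not computed):
+ # {'word_count': 1200, 'sentence_count': 80, 'estimated_reading_time': 6,
+ #  'top_words': [('model', 14), ...], 'sentiment': {'compound': 0.42, ...}}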
249
+ def advanced_extractive_summary(self, text: str, num_sentences: int = 3) -> str:
250
+ """Enhanced extractive summarization with improved sentence scoring"""
251
+ if not text.strip():
252
+ return "No text to summarize."
253
+
254
+ if NLTK_AVAILABLE:
255
+ sentences = sent_tokenize(text)
256
+ else:
257
+ sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
258
+
259
+ if len(sentences) <= num_sentences:
260
+ return text
261
+
262
+ # Enhanced sentence scoring
263
+ scored_sentences = []
264
+ total_sentences = len(sentences)
265
+
266
+ # Calculate word frequencies for TF scoring
267
+ all_words = re.findall(r'\b\w+\b', text.lower())
268
+ word_freq = {}
269
+ for word in all_words:
270
+ if len(word) > 2:
271
+ word_freq[word] = word_freq.get(word, 0) + 1
272
+
273
+ # Important keywords that boost sentence scores
274
+ importance_keywords = [
275
+ 'conclusion', 'summary', 'result', 'finding', 'important', 'significant',
276
+ 'key', 'main', 'primary', 'essential', 'crucial', 'objective', 'goal',
277
+ 'recommendation', 'suggest', 'propose', 'indicate', 'show', 'demonstrate'
278
+ ]
279
+
280
+ for i, sentence in enumerate(sentences):
281
+ if len(sentence.split()) < 5: # Skip very short sentences
282
+ continue
283
+
284
+ score = 0
285
+ sentence_lower = sentence.lower()
286
+ sentence_words = sentence.split()
287
+
288
+ # Position scoring (beginning and end are more important)
289
+ if i < total_sentences * 0.15: # First 15%
290
+ score += 3
291
+ elif i > total_sentences * 0.85: # Last 15%
292
+ score += 2
293
+ elif total_sentences * 0.4 <= i <= total_sentences * 0.6: # Middle section
294
+ score += 1
295
+
296
+ # Length scoring (prefer moderate length)
297
+ word_count = len(sentence_words)
298
+ if 12 <= word_count <= 25:
299
+ score += 3
300
+ elif 8 <= word_count <= 35:
301
+ score += 2
302
+ elif 5 <= word_count <= 45:
303
+ score += 1
304
+
305
+ # Keyword importance scoring
306
+ keyword_score = sum(2 if keyword in sentence_lower else 0 for keyword in importance_keywords)
307
+ score += min(keyword_score, 6) # Cap keyword bonus
308
+
309
+ # TF-based scoring (frequency of important words)
310
+ tf_score = 0
311
+ for word in sentence_words:
312
+ word_lower = word.lower()
313
+ if word_lower in word_freq and len(word_lower) > 3:
314
+ tf_score += min(word_freq[word_lower], 5) # Cap individual word contribution
315
+ score += min(tf_score / len(sentence_words), 3) # Normalize by sentence length
316
+
317
+ # Structural indicators
318
+ if any(indicator in sentence for indicator in [':', '—', '"', '(']):
319
+ score += 1
320
+
321
+ # Numerical data (often important)
322
+ if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
323
+ score += 1
324
+
325
+ scored_sentences.append((sentence, score, i))
326
+
327
+ # Sort by score and select top sentences
328
+ scored_sentences.sort(key=lambda x: x[1], reverse=True)
329
+ selected_sentences = scored_sentences[:num_sentences]
330
+
331
+ # Sort selected sentences by original position to maintain flow
332
+ selected_sentences.sort(key=lambda x: x[2])
333
+
334
+ return ' '.join([s[0] for s in selected_sentences])
335
+
336
+ def intelligent_chunking(self, text: str, max_chunk_size: int = 1024) -> List[str]:
337
+ """Intelligently chunk text while preserving semantic boundaries"""
338
+ if len(text) <= max_chunk_size:
339
+ return [text]
340
+
341
+ chunks = []
342
+
343
+ # Try to split by double newlines first (paragraphs)
344
+ paragraphs = text.split('\n\n')
345
+ current_chunk = ""
346
+
347
+ for paragraph in paragraphs:
348
+ # If single paragraph is too long, split by sentences
349
+ if len(paragraph) > max_chunk_size:
350
+ if current_chunk:
351
+ chunks.append(current_chunk.strip())
352
+ current_chunk = ""
353
+
354
+ # Split long paragraph by sentences
355
+ if NLTK_AVAILABLE:
356
+ sentences = sent_tokenize(paragraph)
357
+ else:
358
+ sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
359
+
360
+ temp_chunk = ""
361
+ for sentence in sentences:
362
+ if len(temp_chunk + sentence) <= max_chunk_size:
363
+ temp_chunk += sentence + ". "
364
+ else:
365
+ if temp_chunk:
366
+ chunks.append(temp_chunk.strip())
367
+ temp_chunk = sentence + ". "
368
+
369
+ if temp_chunk:
370
+ current_chunk = temp_chunk
371
+ else:
372
+ # Normal paragraph processing
373
+ if len(current_chunk + paragraph) <= max_chunk_size:
374
+ current_chunk += paragraph + "\n\n"
375
+ else:
376
+ if current_chunk:
377
+ chunks.append(current_chunk.strip())
378
+ current_chunk = paragraph + "\n\n"
379
+
380
+ if current_chunk:
381
+ chunks.append(current_chunk.strip())
382
+
383
+ return [chunk for chunk in chunks if chunk.strip()]
384
+
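+ # Example behaviour (illustrative): a text shorter than max_chunk_size comes back
+ # as a single chunk; longer texts are split on paragraph boundaries first, and a
+ # paragraph that is itself too long is further split on sentence boundaries.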
385
+ def ai_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str:
386
+ """Enhanced AI-powered summarization with better chunking and error handling"""
387
+ if not self.summarizer:
388
+ return self.advanced_extractive_summary(text)
389
+
390
+ try:
391
+ # Intelligent chunking
392
+ chunks = self.intelligent_chunking(text, 1000) # Slightly smaller chunks for better quality
393
+
394
+ if not chunks:
395
+ return "No meaningful content found for summarization."
396
+
397
+ summaries = []
398
+ for i, chunk in enumerate(chunks):
399
+ if len(chunk.strip()) < 50: # Skip very short chunks
400
+ continue
401
+
402
+ try:
403
+ # Adjust parameters based on chunk size
404
+ chunk_max_length = min(max_length, max(50, len(chunk.split()) // 3))
405
+ chunk_min_length = min(min_length, chunk_max_length // 2)
406
+
407
+ summary = self.summarizer(
408
+ chunk,
409
+ max_length=chunk_max_length,
410
+ min_length=chunk_min_length,
411
+ do_sample=False,
412
+ truncation=True
413
+ )
414
+ summaries.append(summary[0]['summary_text'])
415
+
416
+ except Exception as e:
417
+ logger.warning(f"Error summarizing chunk {i}: {e}")
418
+ # Fallback to extractive summary for this chunk
419
+ fallback_summary = self.advanced_extractive_summary(chunk, 2)
420
+ if fallback_summary and fallback_summary != "No text to summarize.":
421
+ summaries.append(fallback_summary)
422
+
423
+ if not summaries:
424
+ return self.advanced_extractive_summary(text)
425
+
426
+ # Combine and refine summaries
427
+ if len(summaries) == 1:
428
+ return summaries[0]
429
+ else:
430
+ combined_summary = ' '.join(summaries)
431
+
432
+ # If combined summary is still too long, summarize again
433
+ if len(combined_summary.split()) > max_length * 1.5:
434
+ try:
435
+ final_summary = self.summarizer(
436
+ combined_summary,
437
+ max_length=max_length,
438
+ min_length=min_length,
439
+ do_sample=False,
440
+ truncation=True
441
+ )
442
+ return final_summary[0]['summary_text']
443
+ except Exception:
444
+ return combined_summary[:max_length * 10] # Rough character limit fallback
445
+
446
+ return combined_summary
447
+
448
+ except Exception as e:
449
+ logger.error(f"AI summarization failed: {e}")
450
+ return self.advanced_extractive_summary(text)
451
+
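+ # Strategy note: long documents are summarised chunk-by-chunk and the partial
+ # summaries concatenated; if the concatenation still exceeds ~1.5x max_length,
+ # it is passed through the model once more to produce the final summary.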
452
+ def generate_enhanced_key_points(self, text: str, num_points: int = 7) -> List[str]:
453
+ """Generate key points with improved extraction and categorization"""
454
+ if not text.strip():
455
+ return []
456
+
457
+ if NLTK_AVAILABLE:
458
+ sentences = sent_tokenize(text)
459
+ else:
460
+ sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
461
+
462
+ # Enhanced key point indicators with categories
463
+ key_indicators = {
464
+ 'conclusions': ['conclusion', 'conclude', 'result', 'outcome', 'finding', 'discovered'],
465
+ 'objectives': ['objective', 'goal', 'purpose', 'aim', 'target', 'mission'],
466
+ 'methods': ['method', 'approach', 'technique', 'procedure', 'process', 'way'],
467
+ 'importance': ['important', 'significant', 'crucial', 'essential', 'key', 'main', 'primary'],
468
+ 'recommendations': ['recommend', 'suggest', 'propose', 'should', 'must', 'need to'],
469
+ 'problems': ['problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'concern'],
470
+ 'benefits': ['benefit', 'advantage', 'improvement', 'enhancement', 'positive', 'gain']
471
+ }
472
+
473
+ scored_sentences = []
474
+ for sentence in sentences:
475
+ if len(sentence.split()) < 6: # Skip very short sentences
476
+ continue
477
+
478
+ score = 0
479
+ sentence_lower = sentence.lower()
480
+ category = 'general'
481
+
482
+ # Category-based scoring
483
+ for cat, indicators in key_indicators.items():
484
+ category_score = sum(2 if indicator in sentence_lower else 0 for indicator in indicators)
485
+ if category_score > score:
486
+ score = category_score
487
+ category = cat
488
+
489
+ # Structural scoring
490
+ if sentence.strip().startswith(('•', '-', '1.', '2.', '3.', '4.', '5.')):
491
+ score += 4
492
+
493
+ # Punctuation indicators
494
+ if any(punct in sentence for punct in [':', ';', '—', '"']):
495
+ score += 1
496
+
497
+ # Length scoring (prefer moderate length for key points)
498
+ word_count = len(sentence.split())
499
+ if 8 <= word_count <= 20:
500
+ score += 3
501
+ elif 6 <= word_count <= 30:
502
+ score += 2
503
+ elif 4 <= word_count <= 40:
504
+ score += 1
505
+
506
+ # Numerical data bonus
507
+ if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
508
+ score += 2
509
+
510
+ # Avoid very generic sentences
511
+ generic_words = ['the', 'this', 'that', 'there', 'it', 'they']
512
+ if sentence.split()[0].lower() in generic_words:
513
+ score -= 1
514
+
515
+ if score > 0:
516
+ scored_sentences.append((sentence.strip(), score, category))
517
+
518
+ # Sort by score and diversify by category
519
+ scored_sentences.sort(key=lambda x: x[1], reverse=True)
520
+
521
+ # Select diverse key points
522
+ selected_points = []
523
+ used_categories = set()
524
+
525
+ # First pass: get the highest scoring point from each category
526
+ for sentence, score, category in scored_sentences:
527
+ if len(selected_points) >= num_points:
528
+ break
529
+ if category not in used_categories:
530
+ selected_points.append(sentence)
531
+ used_categories.add(category)
532
+
533
+ # Second pass: fill remaining slots with highest scoring sentences
534
+ for sentence, score, category in scored_sentences:
535
+ if len(selected_points) >= num_points:
536
+ break
537
+ if sentence not in selected_points:
538
+ selected_points.append(sentence)
539
+
540
+ return selected_points[:num_points]
541
+
542
+ def generate_document_outline(self, text: str) -> List[str]:
543
+ """Generate a structured outline of the document"""
544
+ if not text.strip():
545
+ return []
546
+
547
+ lines = text.split('\n')
548
+ outline = []
549
+
550
+ # Look for headers, numbered sections, etc.
551
+ header_patterns = [
552
+ r'^#{1,6}\s+(.+)$', # Markdown headers
553
+ r'^(\d+\.?\s+[A-Z][^.]{10,})$', # Numbered sections
554
+ r'^([A-Z][A-Z\s]{5,})$', # ALL CAPS headers
555
+ r'^([A-Z][a-z\s]{10,}:)$', # Title Case with colon
556
+ ]
557
+
558
+ for line in lines:
559
+ line = line.strip()
560
+ if not line:
561
+ continue
562
+
563
+ for pattern in header_patterns:
564
+ match = re.match(pattern, line)
565
+ if match:
566
+ outline.append(match.group(1).strip())
567
+ break
568
+
569
+ return outline[:10] # Limit to 10 outline items
570
+
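+ # The patterns above would pick up lines such as "## Methods",
+ # "2. Experimental Setup" or "RESULTS AND DISCUSSION" (illustrative examples).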
571
+ def process_document(self, file_path: str, summary_type: str = "ai",
572
+ summary_length: str = "medium") -> Tuple[Optional[Dict], Optional[str]]:
573
+ """Enhanced document processing with caching and comprehensive analysis"""
574
+ if not file_path:
575
+ return None, "No file provided."
576
+
577
+ try:
578
+ # Check cache
579
+ file_hash = self._get_file_hash(file_path)
580
+ cache_key = f"{file_hash}_{summary_type}_{summary_length}"
581
+
582
+ if cache_key in self.cache:
583
+ logger.info("Returning cached result")
584
+ return self.cache[cache_key], None
585
+
586
+ # Extract text based on file type
587
+ file_extension = Path(file_path).suffix.lower()
588
+
589
+ if file_extension == '.pdf':
590
+ text = self.extract_text_from_pdf(file_path)
591
+ elif file_extension == '.docx':
592
+ text = self.extract_text_from_docx(file_path)
593
+ elif file_extension in ['.txt', '.md', '.rtf']:
594
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
595
+ text = f.read()
596
+ else:
597
+ return None, f"Unsupported file type: {file_extension}"
598
+
599
+ if not text.strip() or "not available" in text.lower():
600
+ return None, "No text could be extracted from the document or extraction failed."
601
+
602
+ # Clean text
603
+ text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines
604
+ text = re.sub(r' {2,}', ' ', text) # Reduce excessive spaces
605
+
606
+ # Get comprehensive statistics
607
+ stats = self.get_enhanced_document_stats(text)
608
+
609
+ # Generate summary based on type and length
610
+ length_params = {
611
+ "short": {"sentences": 2, "max_length": 80, "min_length": 30},
612
+ "medium": {"sentences": 4, "max_length": 150, "min_length": 50},
613
+ "long": {"sentences": 6, "max_length": 250, "min_length": 100},
614
+ "detailed": {"sentences": 8, "max_length": 400, "min_length": 150}
615
+ }
616
+
617
+ params = length_params.get(summary_length, length_params["medium"])
618
+
619
+ # Generate summary
620
+ if summary_type == "ai" and self.summarizer:
621
+ summary = self.ai_summary(text, params["max_length"], params["min_length"])
622
+ else:
623
+ summary = self.advanced_extractive_summary(text, params["sentences"])
624
+
625
+ # Generate enhanced features
626
+ key_points = self.generate_enhanced_key_points(text, 7)
627
+ outline = self.generate_document_outline(text)
628
+
629
+ # Calculate readability (simple approximation)
630
+ avg_sentence_length = stats.get('avg_sentence_length', 0)
631
+ readability_score = max(0, min(100, 100 - (avg_sentence_length * 2)))
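+ # e.g. an average sentence length of 20 words yields a score of 60 (a rough
+ # heuristic, not a standard readability formula such as Flesch)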
632
+
633
+ result = {
634
+ 'original_text': text[:2000] + "..." if len(text) > 2000 else text, # Truncate for display
635
+ 'full_text_length': len(text),
636
+ 'summary': summary,
637
+ 'key_points': key_points,
638
+ 'outline': outline,
639
+ 'stats': stats,
640
+ 'readability_score': readability_score,
641
+ 'file_name': Path(file_path).name,
642
+ 'file_size': os.path.getsize(file_path),
643
+ 'processing_time': datetime.now().isoformat(),
644
+ 'summary_type': summary_type,
645
+ 'summary_length': summary_length,
646
+ 'model_used': 'AI (BART/T5)' if self.summarizer else 'Extractive'
647
+ }
648
+
649
+ # Cache result
650
+ self.cache[cache_key] = result
651
+
652
+ return result, None
653
+
654
+ except Exception as e:
655
+ logger.error(f"Document processing error: {e}")
656
+ return None, f"Error processing document: {str(e)}"
657
+
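+ # Minimal usage sketch (illustrative, not executed here; "report.pdf" is a
+ # hypothetical local file):
+ #   s = AdvancedDocumentSummarizer()
+ #   result, error = s.process_document("report.pdf", summary_type="ai", summary_length="medium")
+ #   if error is None:
+ #       print(result["summary"]); print(result["key_points"])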
658
+ def create_catalyst_interface():
659
+ """Create the CatalystGPT-4 document summarizer interface"""
660
+
661
+ summarizer = AdvancedDocumentSummarizer()
662
+
663
+ # Enhanced CSS with modern styling
664
+ css = """
665
+ .catalyst-header {
666
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
667
+ color: white;
668
+ padding: 30px;
669
+ border-radius: 20px;
670
+ text-align: center;
671
+ margin-bottom: 25px;
672
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
673
+ }
674
+
675
+ .summary-container {
676
+ background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
677
+ color: white;
678
+ padding: 25px;
679
+ border-radius: 15px;
680
+ margin: 15px 0;
681
+ box-shadow: 0 8px 25px rgba(0,0,0,0.15);
682
+ }
683
+
684
+ .stats-container {
685
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
686
+ color: white;
687
+ padding: 20px;
688
+ border-radius: 12px;
689
+ margin: 15px 0;
690
+ box-shadow: 0 6px 20px rgba(0,0,0,0.1);
691
+ }
692
+
693
+ .key-points-container {
694
+ background: linear-gradient(135deg, #4ecdc4 0%, #44a08d 100%);
695
+ color: white;
696
+ padding: 20px;
697
+ border-radius: 12px;
698
+ margin: 15px 0;
699
+ box-shadow: 0 6px 20px rgba(0,0,0,0.1);
700
+ }
701
+
702
+ .outline-container {
703
+ background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
704
+ color: white;
705
+ padding: 20px;
706
+ border-radius: 12px;
707
+ margin: 15px 0;
708
+ box-shadow: 0 6px 20px rgba(0,0,0,0.1);
709
+ }
710
+
711
+ .error-container {
712
+ background: linear-gradient(135deg, #ff9a9e 0%, #fecfef 100%);
713
+ color: #721c24;
714
+ padding: 20px;
715
+ border-radius: 12px;
716
+ margin: 15px 0;
717
+ border-left: 5px solid #dc3545;
718
+ }
719
+
720
+ .control-panel {
721
+ background: linear-gradient(135deg, #f6f9fc 0%, #e9ecef 100%);
722
+ padding: 25px;
723
+ border-radius: 15px;
724
+ margin: 15px 0;
725
+ border: 1px solid #dee2e6;
726
+ box-shadow: 0 4px 15px rgba(0,0,0,0.05);
727
+ }
728
+
729
+ .file-upload-area {
730
+ border: 3px dashed #007bff;
731
+ border-radius: 15px;
732
+ padding: 40px;
733
+ text-align: center;
734
+ background: linear-gradient(135deg, #f8f9ff 0%, #e3f2fd 100%);
735
+ transition: all 0.3s ease;
736
+ margin: 15px 0;
737
+ }
738
+
739
+ .file-upload-area:hover {
740
+ border-color: #0056b3;
741
+ background: linear-gradient(135deg, #f0f7ff 0%, #e1f5fe 100%);
742
+ transform: translateY(-2px);
743
+ }
744
+
745
+ .metric-card {
746
+ background: white;
747
+ padding: 15px;
748
+ border-radius: 10px;
749
+ margin: 5px;
750
+ box-shadow: 0 2px 8px rgba(0,0,0,0.1);
751
+ text-align: center;
752
+ }
753
+
754
+ .sentiment-indicator {
755
+ display: inline-block;
756
+ padding: 5px 12px;
757
+ border-radius: 20px;
758
+ font-weight: bold;
759
+ font-size: 12px;
760
+ margin: 2px;
761
+ }
762
+
763
+ .sentiment-positive { background: #d4edda; color: #155724; }
764
+ .sentiment-negative { background: #f8d7da; color: #721c24; }
765
+ .sentiment-neutral { background: #d1ecf1; color: #0c5460; }
766
+
767
+ .progress-bar {
768
+ background: #e9ecef;
769
+ border-radius: 10px;
770
+ overflow: hidden;
771
+ height: 8px;
772
+ margin: 5px 0;
773
+ }
774
+
775
+ .progress-fill {
776
+ height: 100%;
777
+ background: linear-gradient(90deg, #28a745, #20c997);
778
+ transition: width 0.3s ease;
779
+ }
780
+ """
781
+
782
+ def format_file_size(size_bytes):
783
+ """Convert bytes to human readable format"""
784
+ for unit in ['B', 'KB', 'MB', 'GB']:
785
+ if size_bytes < 1024.0:
786
+ return f"{size_bytes:.1f} {unit}"
787
+ size_bytes /= 1024.0
788
+ return f"{size_bytes:.1f} TB"
789
+
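+ # e.g. format_file_size(1536) -> "1.5 KB", format_file_size(5_242_880) -> "5.0 MB" (illustrative)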
790
+ def get_sentiment_indicator(sentiment_score):
791
+ """Get sentiment indicator HTML"""
792
+ if sentiment_score > 0.1:
793
+ return '<span class="sentiment-indicator sentiment-positive">😊 Positive</span>'
794
+ elif sentiment_score < -0.1:
795
+ return '<span class="sentiment-indicator sentiment-negative">πŸ˜” Negative</span>'
796
+ else:
797
+ return '<span class="sentiment-indicator sentiment-neutral">😐 Neutral</span>'
798
+
799
+ def process_and_display(file, summary_type, summary_length, enable_ai_features):
800
+ """Enhanced processing with comprehensive results display"""
801
+ if file is None:
802
+ return (
803
+ gr.update(visible=False),
804
+ gr.update(visible=False),
805
+ gr.update(visible=False),
806
+ gr.update(visible=False),
807
+ gr.update(value="""
808
+ <div style="text-align: center; padding: 60px; color: #666;">
809
+ <h3>🚀 CatalystGPT-4 Ready</h3>
810
+ <p>Upload a document to begin advanced AI-powered analysis</p>
811
+ <p><small>Supports: PDF, Word (.docx), Text (.txt, .md, .rtf)</small></p>
812
+ </div>
813
+ """, visible=True)
814
+ )
815
+
816
+ try:
817
+ # Use AI features based on toggle
818
+ actual_summary_type = summary_type if enable_ai_features else "extractive"
819
+
820
+ result, error = summarizer.process_document(file.name, actual_summary_type, summary_length)
821
+
822
+ if error:
823
+ error_html = f'''
824
+ <div class="error-container">
825
+ <h4>❌ Processing Error</h4>
826
+ <p><strong>Error:</strong> {error}</p>
827
+ <p><small>Please try a different file or check the file format.</small></p>
828
+ </div>
829
+ '''
830
+ return (
831
+ gr.update(visible=False),
832
+ gr.update(visible=False),
833
+ gr.update(visible=False),
834
+ gr.update(visible=False),
835
+ gr.update(value=error_html, visible=True)
836
+ )
837
+
838
+ # Format summary display
839
+ summary_html = f'''
840
+ <div class="summary-container">
841
+ <h3>🎯 Document Summary</h3>
842
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin-bottom: 15px;">
843
+ <div><strong>πŸ“„ File:</strong> {result["file_name"]}</div>
844
+ <div><strong>πŸ“Š Size:</strong> {format_file_size(result["file_size"])}</div>
845
+ <div><strong>πŸ€– Model:</strong> {result["model_used"]}</div>
846
+ <div><strong>πŸ“ Length:</strong> {result["summary_length"].title()}</div>
847
+ </div>
848
+ <div style="background: rgba(255,255,255,0.15); padding: 20px; border-radius: 10px; line-height: 1.6;">
849
+ {result["summary"]}
850
+ </div>
851
+ </div>
852
+ '''
853
+
854
+ # Format comprehensive statistics
855
+ stats = result["stats"]
856
+ readability = result["readability_score"]
857
+
858
+ # Create readability indicator
859
+ readability_color = "#28a745" if readability > 70 else "#ffc107" if readability > 40 else "#dc3545"
860
+ readability_text = "Easy" if readability > 70 else "Moderate" if readability > 40 else "Complex"
861
+
862
+ stats_html = f'''
863
+ <div class="stats-container">
864
+ <h3>πŸ“ˆ Document Analytics</h3>
865
+
866
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;">
867
+ <div class="metric-card">
868
+ <h4 style="margin: 0; color: #007bff;">πŸ“ {stats["word_count"]:,}</h4>
869
+ <small>Words</small>
870
+ </div>
871
+ <div class="metric-card">
872
+ <h4 style="margin: 0; color: #28a745;">⏱️ {stats["estimated_reading_time"]} min</h4>
873
+ <small>Reading Time</small>
874
+ </div>
875
+ <div class="metric-card">
876
+ <h4 style="margin: 0; color: #17a2b8;">πŸ“‘ {stats["sentence_count"]:,}</h4>
877
+ <small>Sentences</small>
878
+ </div>
879
+ <div class="metric-card">
880
+ <h4 style="margin: 0; color: #6f42c1;">🧠 {stats.get("unique_words", "N/A")}</h4>
881
+ <small>Unique Words</small>
882
+ </div>
883
+ </div>
884
+
885
+ <div style="margin: 20px 0;">
886
+ <h4>πŸ“– Readability Score</h4>
887
+ <div class="progress-bar">
888
+ <div class="progress-fill" style="width: {readability}%; background-color: {readability_color};"></div>
889
+ </div>
890
+ <p><strong>{readability:.1f}/100</strong> - {readability_text} to read</p>
891
+ </div>
892
+ '''
893
+
894
+ # Add sentiment analysis if available
895
+ if stats.get('sentiment'):
896
+ sentiment = stats['sentiment']
897
+ sentiment_html = get_sentiment_indicator(sentiment['compound'])
898
+ stats_html += f'''
899
+ <div style="margin: 20px 0;">
900
+ <h4>😊 Document Sentiment</h4>
901
+ {sentiment_html}
902
+ <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin-top: 10px;">
903
+ <small>Positive: {sentiment['positive']:.2f}</small>
904
+ <small>Negative: {sentiment['negative']:.2f}</small>
905
+ <small>Neutral: {sentiment['neutral']:.2f}</small>
906
+ </div>
907
+ </div>
908
+ '''
909
+
910
+ # Add word frequency
911
+ if stats.get('top_words'):
912
+ stats_html += f'''
913
+ <div style="margin: 20px 0;">
914
+ <h4>πŸ”€ Most Frequent Words</h4>
915
+ <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px;">
916
+ {" ".join([f'<span style="background: rgba(255,255,255,0.2); padding: 6px 12px; border-radius: 15px; font-size: 13px;">{word} ({count})</span>' for word, count in stats["top_words"][:10]])}
917
+ </div>
918
+ </div>
919
+ '''
920
+
921
+ stats_html += '</div>'
922
+
923
+ # Format key points
924
+ key_points_html = f'''
925
+ <div class="key-points-container">
926
+ <h3>🎯 Key Insights</h3>
927
+ <ul style="list-style: none; padding: 0;">
928
+ '''
929
+ for i, point in enumerate(result["key_points"], 1):
930
+ key_points_html += f'<li style="margin-bottom: 12px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 8px;"><strong>{i}.</strong> {point}</li>'
931
+ key_points_html += '</ul></div>'
932
+
933
+ # Format document outline
934
+ outline_html = ""
935
+ if result.get("outline"):
936
+ outline_html = f'''
937
+ <div class="outline-container">
938
+ <h3>πŸ“‹ Document Structure</h3>
939
+ <ol style="padding-left: 20px;">
940
+ '''
941
+ for item in result["outline"]:
942
+ outline_html += f'<li style="margin-bottom: 8px; padding: 5px 0;">{item}</li>'
943
+ outline_html += '</ol></div>'
944
+
945
+ return (
946
+ gr.update(value=summary_html, visible=True),
947
+ gr.update(value=stats_html, visible=True),
948
+ gr.update(value=key_points_html, visible=True),
949
+ gr.update(value=outline_html, visible=True if outline_html else False),
950
+ gr.update(visible=False)
951
+ )
952
+
953
+ except Exception as e:
954
+ error_html = f'''
955
+ <div class="error-container">
956
+ <h4>💥 Unexpected Error</h4>
957
+ <p><strong>Details:</strong> {str(e)}</p>
958
+ <p><small>Please try again or contact support if the issue persists.</small></p>
959
+ </div>
960
+ '''
961
+ return (
962
+ gr.update(visible=False),
963
+ gr.update(visible=False),
964
+ gr.update(visible=False),
965
+ gr.update(visible=False),
966
+ gr.update(value=error_html, visible=True)
967
+ )
968
+
969
+ # Create the main interface
970
+ with gr.Blocks(css=css, title="🚀 CatalystGPT-4 Document Summarizer", theme=gr.themes.Soft()) as demo:
971
+
972
+ # Header
973
+ gr.HTML("""
974
+ <div class="catalyst-header">
975
+ <h1 style="margin: 0; font-size: 3em; font-weight: bold;">🚀 CatalystGPT-4</h1>
976
+ <h2 style="margin: 10px 0; font-size: 1.5em; opacity: 0.9;">Advanced Document Summarizer</h2>
977
+ <p style="margin: 15px 0 0 0; font-size: 1.1em; opacity: 0.8;">
978
+ Powered by AI β€’ Extractive & Abstractive Summarization β€’ Comprehensive Analytics
979
+ </p>
980
+ </div>
981
+ """)
982
+
983
+ with gr.Row():
984
+ # Left column - Enhanced Controls
985
+ with gr.Column(scale=1):
986
+ with gr.Group():
987
+ gr.HTML('<div class="control-panel">')
988
+
989
+ gr.Markdown("### πŸ“ Document Upload")
990
+ file_upload = gr.File(
991
+ label="Choose your document",
992
+ file_types=[".pdf", ".docx", ".txt", ".md", ".rtf"],
993
+ elem_classes="file-upload-area"
994
+ )
995
+
996
+ gr.Markdown("### βš™οΈ Analysis Settings")
997
+
998
+ enable_ai_features = gr.Checkbox(
999
+ label="πŸ€– Enable AI Features",
1000
+ value=TRANSFORMERS_AVAILABLE,
1001
+ info="Use advanced AI models for better summarization",
1002
+ interactive=TRANSFORMERS_AVAILABLE
1003
+ )
1004
+
1005
+ summary_type = gr.Radio(
1006
+ choices=[
1007
+ ("🧠 AI Summary (Neural)", "ai"),
1008
+ ("πŸ“ Extractive Summary", "extractive")
1009
+ ],
1010
+ value="ai" if TRANSFORMERS_AVAILABLE else "extractive",
1011
+ label="Summarization Method",
1012
+ info="AI generates new text, Extractive selects key sentences"
1013
+ )
1014
+
1015
+ summary_length = gr.Radio(
1016
+ choices=[
1017
+ ("⚑ Short & Concise", "short"),
1018
+ ("πŸ“„ Standard Length", "medium"),
1019
+ ("πŸ“– Detailed Analysis", "long"),
1020
+ ("πŸ” Comprehensive Report", "detailed")
1021
+ ],
1022
+ value="medium",
1023
+ label="Analysis Depth",
1024
+ info="Choose the level of detail for your analysis"
1025
+ )
1026
+
1027
+ analyze_btn = gr.Button(
1028
+ "πŸš€ Analyze Document",
1029
+ variant="primary",
1030
+ size="lg",
1031
+ elem_classes="analyze-button"
1032
+ )
1033
+
1034
+ gr.HTML('</div>')
1035
+
1036
+ # Enhanced Library Status
1037
+ gr.Markdown(f"""
1038
+ ### πŸ“Š System Status
1039
+
1040
+ **Core Features:**
1041
+ - πŸ“„ **PDF Processing:** {"βœ… PyMuPDF" if PYMUPDF_AVAILABLE else ("βœ… PyPDF2" if PDF_AVAILABLE else "❌ Not Available")}
1042
+ - πŸ“ **Word Documents:** {"βœ… Available" if DOCX_AVAILABLE else "❌ Install python-docx"}
1043
+ - πŸ€– **AI Summarization:** {"βœ… Available" if TRANSFORMERS_AVAILABLE else "❌ Install transformers"}
1044
+ - πŸ“ˆ **Advanced NLP:** {"βœ… Available" if NLTK_AVAILABLE else "⚠️ Basic processing"}
1045
+ - 😊 **Sentiment Analysis:** {"βœ… Available" if (NLTK_AVAILABLE and summarizer.sentiment_analyzer) else "❌ Not Available"}
1046
+
1047
+ **Performance:**
1048
+ - πŸ”§ **Device:** {"GPU" if DEVICE >= 0 else "CPU"}
1049
+ - πŸ’Ύ **Cache:** {"Enabled" if summarizer.cache is not None else "Disabled"}
1050
+ """)
1051
+
1052
+ # Right column - Enhanced Results
1053
+ with gr.Column(scale=2):
1054
+
1055
+ # Welcome message
1056
+ welcome_msg = gr.HTML(
1057
+ value="""
1058
+ <div style="text-align: center; padding: 80px 20px; color: #666;">
1059
+ <div style="font-size: 4em; margin-bottom: 20px;">πŸ“š</div>
1060
+ <h2 style="color: #333; margin-bottom: 15px;">Ready for Analysis</h2>
1061
+ <p style="font-size: 1.1em; margin-bottom: 10px;">Upload any document to unlock AI-powered insights</p>
1062
+ <p><small style="color: #888;">Supports PDF, Word, Text, Markdown, and RTF files</small></p>
1063
+ <div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px; display: inline-block;">
1064
+ <strong>Features:</strong> AI Summarization β€’ Key Points β€’ Analytics β€’ Sentiment Analysis
1065
+ </div>
1066
+ </div>
1067
+ """,
1068
+ visible=True
1069
+ )
1070
+
1071
+ # Results sections
1072
+ summary_display = gr.HTML(visible=False)
1073
+ stats_display = gr.HTML(visible=False)
1074
+ key_points_display = gr.HTML(visible=False)
1075
+ outline_display = gr.HTML(visible=False)
1076
+ error_display = gr.HTML(visible=False)
1077
+
1078
+ # Event handlers
1079
+ def on_file_change(file):
1080
+ if file is None:
1081
+ return (
1082
+ gr.update(visible=True),
1083
+ gr.update(visible=False),
1084
+ gr.update(visible=False),
1085
+ gr.update(visible=False),
1086
+ gr.update(visible=False),
1087
+ gr.update(visible=False)
1088
+ )
1089
+ else:
1090
+ return (
1091
+ gr.update(visible=False),
1092
+ gr.update(visible=False),
1093
+ gr.update(visible=False),
1094
+ gr.update(visible=False),
1095
+ gr.update(visible=False),
1096
+ gr.update(visible=False)
1097
+ )
1098
+
1099
+ # Auto-hide welcome when file uploaded
1100
+ file_upload.change(
1101
+ fn=on_file_change,
1102
+ inputs=[file_upload],
1103
+ outputs=[welcome_msg, summary_display, stats_display, key_points_display, outline_display, error_display]
1104
+ )
1105
+
1106
+ # Process document on button click
1107
+ analyze_btn.click(
1108
+ fn=process_and_display,
1109
+ inputs=[file_upload, summary_type, summary_length, enable_ai_features],
1110
+ outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
1111
+ )
1112
+
1113
+ # Auto-process when settings change (if file uploaded)
1114
+ for component in [summary_type, summary_length, enable_ai_features]:
1115
+ component.change(
1116
+ fn=process_and_display,
1117
+ inputs=[file_upload, summary_type, summary_length, enable_ai_features],
1118
+ outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
1119
+ )
1120
+
1121
+ # Enhanced Footer
1122
+ gr.HTML("""
1123
+ <div style="margin-top: 50px; padding: 30px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
1124
+ border-radius: 15px; text-align: center; border-top: 3px solid #007bff;">
1125
+ <h3 style="color: #333; margin-bottom: 20px;">πŸ› οΈ Installation & Setup</h3>
1126
+
1127
+ <div style="background: #343a40; color: #fff; padding: 15px; border-radius: 8px;
1128
+ font-family: 'Courier New', monospace; margin: 15px 0;">
1129
+ <strong>Quick Install:</strong><br>
1130
+ pip install gradio python-docx PyPDF2 transformers torch nltk PyMuPDF
1131
+ </div>
1132
+
1133
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 20px;">
1134
+ <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1135
+ <strong>🎯 Core Features</strong><br>
1136
+ <small>Multi-format support, AI summarization, key insights extraction</small>
1137
+ </div>
1138
+ <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1139
+ <strong>πŸ“Š Advanced Analytics</strong><br>
1140
+ <small>Sentiment analysis, readability scoring, word frequency</small>
1141
+ </div>
1142
+ <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1143
+ <strong>πŸš€ Performance</strong><br>
1144
+ <small>Intelligent caching, GPU acceleration, batch processing</small>
1145
+ </div>
1146
+ </div>
1147
+
1148
+ <p style="margin-top: 20px; color: #666;">
1149
+ <strong>CatalystGPT-4</strong> - Advanced Document Analysis Platform
1150
+ </p>
1151
+ </div>
1152
+ """)
1153
+
1154
+ return demo
1155
+
1156
+ if __name__ == "__main__":
1157
+ demo = create_catalyst_interface()
1158
+ demo.launch(
1159
+ server_name="0.0.0.0",
1160
+ server_port=7860,
1161
+ show_error=True
+ )
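+ # To run locally (assuming the dependencies above are installed): `python app.py`,
+ # then open http://localhost:7860 in a browser.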