|
import gradio as gr |
|
import os |
|
import re |
|
import json |
|
import tempfile |
|
import hashlib |
|
from pathlib import Path |
|
from datetime import datetime |
|
from typing import Dict, List, Tuple, Optional, Union |
|
import logging |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
try: |
|
from docx import Document |
|
DOCX_AVAILABLE = True |
|
except ImportError: |
|
DOCX_AVAILABLE = False |
|
logger.warning("python-docx not installed. DOCX processing will be disabled.") |
|
|
|
try: |
|
import PyPDF2 |
|
PDF_AVAILABLE = True |
|
except ImportError: |
|
PDF_AVAILABLE = False |
|
logger.warning("PyPDF2 not installed. PDF processing will be disabled.") |
|
|
|
try: |
|
import fitz |
|
PYMUPDF_AVAILABLE = True |
|
except ImportError: |
|
PYMUPDF_AVAILABLE = False |
|
|
|
|
|
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.probability import FreqDist  # FreqDist lives in nltk.probability; there is no nltk.frequency module
    from nltk.sentiment import SentimentIntensityAnalyzer
    NLTK_AVAILABLE = True

    # Map each required NLTK resource to the path nltk.data.find() expects,
    # and download it only when it is missing.
    required_nltk_data = {
        'punkt': 'tokenizers/punkt',
        'punkt_tab': 'tokenizers/punkt_tab',  # needed by newer NLTK releases for sent_tokenize
        'stopwords': 'corpora/stopwords',
        'vader_lexicon': 'sentiment/vader_lexicon.zip',
    }
    for data_name, data_path in required_nltk_data.items():
        try:
            nltk.data.find(data_path)
        except LookupError:
            nltk.download(data_name, quiet=True)
except ImportError:
    NLTK_AVAILABLE = False
    logger.warning("NLTK not installed. Advanced text analysis will be limited.")
|
|
|
try: |
|
from transformers import pipeline |
|
import torch |
|
TRANSFORMERS_AVAILABLE = True |
|
DEVICE = 0 if torch.cuda.is_available() else -1 |
|
except ImportError: |
|
TRANSFORMERS_AVAILABLE = False |
|
DEVICE = -1 |
|
logger.warning("transformers not installed. AI summarization will use basic extraction methods.") |
|
|
|
class AdvancedDocumentSummarizer: |
|
"""CatalystGPT-4 Advanced Document Summarizer with enhanced features""" |
|
|
|
def __init__(self): |
|
self.summarizer = None |
|
self.sentiment_analyzer = None |
|
self.cache = {} |
|
|
|
|
|
if TRANSFORMERS_AVAILABLE: |
|
self._initialize_ai_models() |
|
|
|
|
|
if NLTK_AVAILABLE: |
|
try: |
|
self.sentiment_analyzer = SentimentIntensityAnalyzer() |
|
except Exception as e: |
|
logger.warning(f"Failed to initialize sentiment analyzer: {e}") |
|
|
|
def _initialize_ai_models(self): |
|
"""Initialize AI models with error handling and fallbacks""" |
|
models_to_try = [ |
|
"facebook/bart-large-cnn", |
|
"t5-small", |
|
"google/pegasus-xsum" |
|
] |
|
|
|
for model_name in models_to_try: |
|
try: |
|
self.summarizer = pipeline( |
|
"summarization", |
|
model=model_name, |
|
device=DEVICE, |
|
torch_dtype=torch.float16 if DEVICE >= 0 else torch.float32 |
|
) |
|
logger.info(f"Successfully loaded {model_name}") |
|
break |
|
except Exception as e: |
|
logger.warning(f"Failed to load {model_name}: {e}") |
|
continue |
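        # If every candidate model above fails to load, self.summarizer stays None and
        # the class falls back to advanced_extractive_summary() wherever a summary is requested.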
|
|
|
def _get_file_hash(self, file_path: str) -> str: |
|
"""Generate hash for file caching""" |
|
try: |
|
with open(file_path, 'rb') as f: |
|
content = f.read() |
|
return hashlib.md5(content).hexdigest() |
|
except Exception: |
|
return str(datetime.now().timestamp()) |
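    # Note: MD5 here is only a cache key for previously processed files, not a security measure;
    # on a read failure the timestamp fallback simply disables caching for that call.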
|
|
|
def extract_text_from_pdf(self, file_path: str) -> str: |
|
"""Enhanced PDF text extraction with better error handling""" |
|
text = "" |
|
|
|
|
|
if PYMUPDF_AVAILABLE: |
|
try: |
|
doc = fitz.open(file_path) |
|
for page_num, page in enumerate(doc): |
|
page_text = page.get_text() |
|
if page_text.strip(): |
|
text += f"\n--- Page {page_num + 1} ---\n{page_text}\n" |
|
doc.close() |
|
|
|
if text.strip(): |
|
return text |
|
except Exception as e: |
|
logger.error(f"PyMuPDF extraction failed: {e}") |
|
|
|
|
|
if PDF_AVAILABLE: |
|
try: |
|
with open(file_path, 'rb') as file: |
|
pdf_reader = PyPDF2.PdfReader(file) |
|
for page_num, page in enumerate(pdf_reader.pages): |
|
page_text = page.extract_text() |
|
if page_text.strip(): |
|
text += f"\n--- Page {page_num + 1} ---\n{page_text}\n" |
|
|
|
if text.strip(): |
|
return text |
|
except Exception as e: |
|
logger.error(f"PyPDF2 extraction failed: {e}") |
|
|
|
return "PDF processing libraries not available or extraction failed." |
|
|
|
def extract_text_from_docx(self, file_path: str) -> str: |
|
"""Enhanced DOCX extraction with better formatting preservation""" |
|
if not DOCX_AVAILABLE: |
|
return "python-docx library not available." |
|
|
|
try: |
|
doc = Document(file_path) |
|
text_parts = [] |
|
|
|
|
|
for paragraph in doc.paragraphs: |
|
if paragraph.text.strip(): |
|
text_parts.append(paragraph.text) |
|
|
|
|
|
for table_num, table in enumerate(doc.tables): |
|
text_parts.append(f"\n--- Table {table_num + 1} ---") |
|
for row in table.rows: |
|
row_text = " | ".join(cell.text.strip() for cell in row.cells) |
|
if row_text.strip(): |
|
text_parts.append(row_text) |
|
|
|
return "\n".join(text_parts) |
|
except Exception as e: |
|
logger.error(f"Error processing DOCX file: {e}") |
|
return f"Error processing DOCX file: {str(e)}" |
|
|
|
def get_enhanced_document_stats(self, text: str) -> Dict: |
|
"""Get comprehensive document statistics with sentiment analysis""" |
|
if not text.strip(): |
|
return {} |
|
|
|
|
|
word_count = len(text.split()) |
|
char_count = len(text) |
|
char_count_no_spaces = len(text.replace(' ', '')) |
|
paragraph_count = len([p for p in text.split('\n\n') if p.strip()]) |
|
|
|
stats = { |
|
'word_count': word_count, |
|
'character_count': char_count, |
|
'character_count_no_spaces': char_count_no_spaces, |
|
'paragraph_count': paragraph_count, |
|
'estimated_reading_time': max(1, round(word_count / 200)), |
|
'estimated_speaking_time': max(1, round(word_count / 150)) |
|
} |
|
|
|
if NLTK_AVAILABLE: |
|
sentences = sent_tokenize(text) |
|
stats['sentence_count'] = len(sentences) |
|
stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0 |
|
|
|
|
|
words = word_tokenize(text.lower()) |
|
stop_words = set(stopwords.words('english')) |
|
filtered_words = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 2] |
|
|
|
if filtered_words: |
|
freq_dist = FreqDist(filtered_words) |
|
stats['top_words'] = freq_dist.most_common(15) |
|
stats['unique_words'] = len(set(filtered_words)) |
|
stats['lexical_diversity'] = round(len(set(filtered_words)) / len(filtered_words), 3) if filtered_words else 0 |
|
|
|
|
|
if self.sentiment_analyzer: |
|
try: |
|
sentiment_scores = self.sentiment_analyzer.polarity_scores(text[:5000]) |
|
stats['sentiment'] = { |
|
'compound': round(sentiment_scores['compound'], 3), |
|
'positive': round(sentiment_scores['pos'], 3), |
|
'negative': round(sentiment_scores['neg'], 3), |
|
'neutral': round(sentiment_scores['neu'], 3) |
|
} |
|
except Exception as e: |
|
logger.error(f"Sentiment analysis failed: {e}") |
|
else: |
|
|
|
sentences = [s.strip() for s in text.split('.') if s.strip()] |
|
stats['sentence_count'] = len(sentences) |
|
stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0 |
|
|
|
words = re.findall(r'\b\w+\b', text.lower()) |
|
word_freq = {} |
|
for word in words: |
|
if len(word) > 2: |
|
word_freq[word] = word_freq.get(word, 0) + 1 |
|
|
|
stats['top_words'] = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15] |
|
stats['unique_words'] = len(set(words)) |
|
|
|
return stats |
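    # The time estimates above assume roughly 200 words per minute for reading and 150 for speaking;
    # lexical_diversity is the type-token ratio of the filtered (non-stopword, alphabetic) words.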
|
|
|
def advanced_extractive_summary(self, text: str, num_sentences: int = 3) -> str: |
|
"""Enhanced extractive summarization with improved sentence scoring""" |
|
if not text.strip(): |
|
return "No text to summarize." |
|
|
|
if NLTK_AVAILABLE: |
|
sentences = sent_tokenize(text) |
|
else: |
|
sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()] |
|
|
|
if len(sentences) <= num_sentences: |
|
return text |
|
|
|
|
|
scored_sentences = [] |
|
total_sentences = len(sentences) |
|
|
|
|
|
all_words = re.findall(r'\b\w+\b', text.lower()) |
|
word_freq = {} |
|
for word in all_words: |
|
if len(word) > 2: |
|
word_freq[word] = word_freq.get(word, 0) + 1 |
|
|
|
|
|
importance_keywords = [ |
|
'conclusion', 'summary', 'result', 'finding', 'important', 'significant', |
|
'key', 'main', 'primary', 'essential', 'crucial', 'objective', 'goal', |
|
'recommendation', 'suggest', 'propose', 'indicate', 'show', 'demonstrate' |
|
] |
|
|
|
for i, sentence in enumerate(sentences): |
|
if len(sentence.split()) < 5: |
|
continue |
|
|
|
score = 0 |
|
sentence_lower = sentence.lower() |
|
sentence_words = sentence.split() |
|
|
|
|
|
if i < total_sentences * 0.15: |
|
score += 3 |
|
elif i > total_sentences * 0.85: |
|
score += 2 |
|
elif total_sentences * 0.4 <= i <= total_sentences * 0.6: |
|
score += 1 |
|
|
|
|
|
word_count = len(sentence_words) |
|
if 12 <= word_count <= 25: |
|
score += 3 |
|
elif 8 <= word_count <= 35: |
|
score += 2 |
|
elif 5 <= word_count <= 45: |
|
score += 1 |
|
|
|
|
|
keyword_score = sum(2 if keyword in sentence_lower else 0 for keyword in importance_keywords) |
|
score += min(keyword_score, 6) |
|
|
|
|
|
tf_score = 0 |
|
for word in sentence_words: |
|
word_lower = word.lower() |
|
if word_lower in word_freq and len(word_lower) > 3: |
|
tf_score += min(word_freq[word_lower], 5) |
|
score += min(tf_score / len(sentence_words), 3) |
|
|
|
|
|
            if any(indicator in sentence for indicator in [':', '–', '"', '(']):
|
score += 1 |
|
|
|
|
|
if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence): |
|
score += 1 |
|
|
|
scored_sentences.append((sentence, score, i)) |
|
|
|
|
|
scored_sentences.sort(key=lambda x: x[1], reverse=True) |
|
selected_sentences = scored_sentences[:num_sentences] |
|
|
|
|
|
selected_sentences.sort(key=lambda x: x[2]) |
|
|
|
return ' '.join([s[0] for s in selected_sentences]) |
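    # Sentence scores combine position (openings and endings weighted highest), length,
    # importance keywords, term frequency, structural punctuation, and the presence of numbers;
    # the top-scoring sentences are then re-ordered by original position so the summary reads naturally.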
|
|
|
def intelligent_chunking(self, text: str, max_chunk_size: int = 1024) -> List[str]: |
|
"""Intelligently chunk text while preserving semantic boundaries""" |
|
if len(text) <= max_chunk_size: |
|
return [text] |
|
|
|
chunks = [] |
|
|
|
|
|
paragraphs = text.split('\n\n') |
|
current_chunk = "" |
|
|
|
for paragraph in paragraphs: |
|
|
|
if len(paragraph) > max_chunk_size: |
|
if current_chunk: |
|
chunks.append(current_chunk.strip()) |
|
current_chunk = "" |
|
|
|
|
|
if NLTK_AVAILABLE: |
|
sentences = sent_tokenize(paragraph) |
|
else: |
|
sentences = [s.strip() for s in paragraph.split('.') if s.strip()] |
|
|
|
temp_chunk = "" |
|
for sentence in sentences: |
|
if len(temp_chunk + sentence) <= max_chunk_size: |
|
temp_chunk += sentence + ". " |
|
else: |
|
if temp_chunk: |
|
chunks.append(temp_chunk.strip()) |
|
temp_chunk = sentence + ". " |
|
|
|
if temp_chunk: |
|
current_chunk = temp_chunk |
|
else: |
|
|
|
if len(current_chunk + paragraph) <= max_chunk_size: |
|
current_chunk += paragraph + "\n\n" |
|
else: |
|
if current_chunk: |
|
chunks.append(current_chunk.strip()) |
|
current_chunk = paragraph + "\n\n" |
|
|
|
if current_chunk: |
|
chunks.append(current_chunk.strip()) |
|
|
|
return [chunk for chunk in chunks if chunk.strip()] |
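    # Chunking strategy: split on blank-line paragraphs first, and only fall back to sentence-level
    # splitting when a single paragraph exceeds max_chunk_size, so semantic boundaries are preserved.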
|
|
|
def ai_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str: |
|
"""Enhanced AI-powered summarization with better chunking and error handling""" |
|
if not self.summarizer: |
|
return self.advanced_extractive_summary(text) |
|
|
|
try: |
|
|
|
chunks = self.intelligent_chunking(text, 1000) |
|
|
|
if not chunks: |
|
return "No meaningful content found for summarization." |
|
|
|
summaries = [] |
|
for i, chunk in enumerate(chunks): |
|
if len(chunk.strip()) < 50: |
|
continue |
|
|
|
try: |
|
|
|
chunk_max_length = min(max_length, max(50, len(chunk.split()) // 3)) |
|
chunk_min_length = min(min_length, chunk_max_length // 2) |
|
|
|
summary = self.summarizer( |
|
chunk, |
|
max_length=chunk_max_length, |
|
min_length=chunk_min_length, |
|
do_sample=False, |
|
truncation=True |
|
) |
|
summaries.append(summary[0]['summary_text']) |
|
|
|
except Exception as e: |
|
logger.warning(f"Error summarizing chunk {i}: {e}") |
|
|
|
fallback_summary = self.advanced_extractive_summary(chunk, 2) |
|
if fallback_summary and fallback_summary != "No text to summarize.": |
|
summaries.append(fallback_summary) |
|
|
|
if not summaries: |
|
return self.advanced_extractive_summary(text) |
|
|
|
|
|
if len(summaries) == 1: |
|
return summaries[0] |
|
else: |
|
combined_summary = ' '.join(summaries) |
|
|
|
|
|
if len(combined_summary.split()) > max_length * 1.5: |
|
try: |
|
final_summary = self.summarizer( |
|
combined_summary, |
|
max_length=max_length, |
|
min_length=min_length, |
|
do_sample=False, |
|
truncation=True |
|
) |
|
return final_summary[0]['summary_text'] |
|
except Exception: |
|
return combined_summary[:max_length * 10] |
|
|
|
return combined_summary |
|
|
|
except Exception as e: |
|
logger.error(f"AI summarization failed: {e}") |
|
return self.advanced_extractive_summary(text) |
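    # ai_summary() summarizes each ~1000-character chunk independently and, if the concatenated
    # result is still longer than desired, runs one more summarization pass over the combined text.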
|
|
|
def generate_enhanced_key_points(self, text: str, num_points: int = 7) -> List[str]: |
|
"""Generate key points with improved extraction and categorization""" |
|
if not text.strip(): |
|
return [] |
|
|
|
if NLTK_AVAILABLE: |
|
sentences = sent_tokenize(text) |
|
else: |
|
sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()] |
|
|
|
|
|
key_indicators = { |
|
'conclusions': ['conclusion', 'conclude', 'result', 'outcome', 'finding', 'discovered'], |
|
'objectives': ['objective', 'goal', 'purpose', 'aim', 'target', 'mission'], |
|
'methods': ['method', 'approach', 'technique', 'procedure', 'process', 'way'], |
|
'importance': ['important', 'significant', 'crucial', 'essential', 'key', 'main', 'primary'], |
|
'recommendations': ['recommend', 'suggest', 'propose', 'should', 'must', 'need to'], |
|
'problems': ['problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'concern'], |
|
'benefits': ['benefit', 'advantage', 'improvement', 'enhancement', 'positive', 'gain'] |
|
} |
|
|
|
scored_sentences = [] |
|
for sentence in sentences: |
|
if len(sentence.split()) < 6: |
|
continue |
|
|
|
score = 0 |
|
sentence_lower = sentence.lower() |
|
category = 'general' |
|
|
|
|
|
for cat, indicators in key_indicators.items(): |
|
category_score = sum(2 if indicator in sentence_lower else 0 for indicator in indicators) |
|
if category_score > score: |
|
score = category_score |
|
category = cat |
|
|
|
|
|
            if sentence.strip().startswith(('•', '-', '1.', '2.', '3.', '4.', '5.')):
|
score += 4 |
|
|
|
|
|
            if any(punct in sentence for punct in [':', ';', '–', '"']):
|
score += 1 |
|
|
|
|
|
word_count = len(sentence.split()) |
|
if 8 <= word_count <= 20: |
|
score += 3 |
|
elif 6 <= word_count <= 30: |
|
score += 2 |
|
elif 4 <= word_count <= 40: |
|
score += 1 |
|
|
|
|
|
if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence): |
|
score += 2 |
|
|
|
|
|
generic_words = ['the', 'this', 'that', 'there', 'it', 'they'] |
|
if sentence.split()[0].lower() in generic_words: |
|
score -= 1 |
|
|
|
if score > 0: |
|
scored_sentences.append((sentence.strip(), score, category)) |
|
|
|
|
|
scored_sentences.sort(key=lambda x: x[1], reverse=True) |
|
|
|
|
|
selected_points = [] |
|
used_categories = set() |
|
|
|
|
|
for sentence, score, category in scored_sentences: |
|
if len(selected_points) >= num_points: |
|
break |
|
if category not in used_categories: |
|
selected_points.append(sentence) |
|
used_categories.add(category) |
|
|
|
|
|
for sentence, score, category in scored_sentences: |
|
if len(selected_points) >= num_points: |
|
break |
|
if sentence not in selected_points: |
|
selected_points.append(sentence) |
|
|
|
return selected_points[:num_points] |
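    # Key-point selection first takes the best sentence from each category (conclusions, objectives,
    # methods, ...) for coverage, then fills any remaining slots purely by score.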
|
|
|
def generate_document_outline(self, text: str) -> List[str]: |
|
"""Generate a structured outline of the document""" |
|
if not text.strip(): |
|
return [] |
|
|
|
lines = text.split('\n') |
|
outline = [] |
|
|
|
|
|
header_patterns = [ |
|
r'^#{1,6}\s+(.+)$', |
|
r'^(\d+\.?\s+[A-Z][^.]{10,})$', |
|
r'^([A-Z][A-Z\s]{5,})$', |
|
r'^([A-Z][a-z\s]{10,}:)$', |
|
] |
|
|
|
for line in lines: |
|
line = line.strip() |
|
if not line: |
|
continue |
|
|
|
for pattern in header_patterns: |
|
match = re.match(pattern, line) |
|
if match: |
|
outline.append(match.group(1).strip()) |
|
break |
|
|
|
return outline[:10] |
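    # Header detection is heuristic: Markdown headings, numbered headings, ALL-CAPS lines,
    # and "Title Case:" lines are all treated as outline entries (first ten only).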
|
|
|
def process_document(self, file_path: str, summary_type: str = "ai", |
|
summary_length: str = "medium") -> Tuple[Optional[Dict], Optional[str]]: |
|
"""Enhanced document processing with caching and comprehensive analysis""" |
|
if not file_path: |
|
return None, "No file provided." |
|
|
|
try: |
|
|
|
file_hash = self._get_file_hash(file_path) |
|
cache_key = f"{file_hash}_{summary_type}_{summary_length}" |
|
|
|
if cache_key in self.cache: |
|
logger.info("Returning cached result") |
|
return self.cache[cache_key], None |
|
|
|
|
|
file_extension = Path(file_path).suffix.lower() |
|
|
|
if file_extension == '.pdf': |
|
text = self.extract_text_from_pdf(file_path) |
|
elif file_extension == '.docx': |
|
text = self.extract_text_from_docx(file_path) |
|
elif file_extension in ['.txt', '.md', '.rtf']: |
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: |
|
text = f.read() |
|
else: |
|
return None, f"Unsupported file type: {file_extension}" |
|
|
|
if not text.strip() or "not available" in text.lower(): |
|
return None, "No text could be extracted from the document or extraction failed." |
|
|
|
|
|
text = re.sub(r'\n{3,}', '\n\n', text) |
|
text = re.sub(r' {2,}', ' ', text) |
|
|
|
|
|
stats = self.get_enhanced_document_stats(text) |
|
|
|
|
|
length_params = { |
|
"short": {"sentences": 2, "max_length": 80, "min_length": 30}, |
|
"medium": {"sentences": 4, "max_length": 150, "min_length": 50}, |
|
"long": {"sentences": 6, "max_length": 250, "min_length": 100}, |
|
"detailed": {"sentences": 8, "max_length": 400, "min_length": 150} |
|
} |
|
|
|
params = length_params.get(summary_length, length_params["medium"]) |
|
|
|
|
|
if summary_type == "ai" and self.summarizer: |
|
summary = self.ai_summary(text, params["max_length"], params["min_length"]) |
|
else: |
|
summary = self.advanced_extractive_summary(text, params["sentences"]) |
|
|
|
|
|
key_points = self.generate_enhanced_key_points(text, 7) |
|
outline = self.generate_document_outline(text) |
|
|
|
|
|
avg_sentence_length = stats.get('avg_sentence_length', 0) |
|
readability_score = max(0, min(100, 100 - (avg_sentence_length * 2))) |
|
|
|
result = { |
|
'original_text': text[:2000] + "..." if len(text) > 2000 else text, |
|
'full_text_length': len(text), |
|
'summary': summary, |
|
'key_points': key_points, |
|
'outline': outline, |
|
'stats': stats, |
|
'readability_score': readability_score, |
|
'file_name': Path(file_path).name, |
|
'file_size': os.path.getsize(file_path), |
|
'processing_time': datetime.now().isoformat(), |
|
'summary_type': summary_type, |
|
'summary_length': summary_length, |
|
'model_used': 'AI (BART/T5)' if self.summarizer else 'Extractive' |
|
} |
|
|
|
|
|
self.cache[cache_key] = result |
|
|
|
return result, None |
|
|
|
except Exception as e: |
|
logger.error(f"Document processing error: {e}") |
|
return None, f"Error processing document: {str(e)}" |
|
|
|
def create_catalyst_interface(): |
|
"""Create the CatalystGPT-4 document summarizer interface""" |
|
|
|
summarizer = AdvancedDocumentSummarizer() |
|
|
|
|
|
css = """ |
|
.catalyst-header { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
padding: 30px; |
|
border-radius: 20px; |
|
text-align: center; |
|
margin-bottom: 25px; |
|
box-shadow: 0 10px 30px rgba(0,0,0,0.2); |
|
} |
|
|
|
.summary-container { |
|
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); |
|
color: white; |
|
padding: 25px; |
|
border-radius: 15px; |
|
margin: 15px 0; |
|
box-shadow: 0 8px 25px rgba(0,0,0,0.15); |
|
} |
|
|
|
.stats-container { |
|
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); |
|
color: white; |
|
padding: 20px; |
|
border-radius: 12px; |
|
margin: 15px 0; |
|
box-shadow: 0 6px 20px rgba(0,0,0,0.1); |
|
} |
|
|
|
.key-points-container { |
|
background: linear-gradient(135deg, #4ecdc4 0%, #44a08d 100%); |
|
color: white; |
|
padding: 20px; |
|
border-radius: 12px; |
|
margin: 15px 0; |
|
box-shadow: 0 6px 20px rgba(0,0,0,0.1); |
|
} |
|
|
|
.outline-container { |
|
background: linear-gradient(135deg, #fa709a 0%, #fee140 100%); |
|
color: white; |
|
padding: 20px; |
|
border-radius: 12px; |
|
margin: 15px 0; |
|
box-shadow: 0 6px 20px rgba(0,0,0,0.1); |
|
} |
|
|
|
.error-container { |
|
background: linear-gradient(135deg, #ff9a9e 0%, #fecfef 100%); |
|
color: #721c24; |
|
padding: 20px; |
|
border-radius: 12px; |
|
margin: 15px 0; |
|
border-left: 5px solid #dc3545; |
|
} |
|
|
|
.control-panel { |
|
background: linear-gradient(135deg, #f6f9fc 0%, #e9ecef 100%); |
|
padding: 25px; |
|
border-radius: 15px; |
|
margin: 15px 0; |
|
border: 1px solid #dee2e6; |
|
box-shadow: 0 4px 15px rgba(0,0,0,0.05); |
|
} |
|
|
|
.file-upload-area { |
|
border: 3px dashed #007bff; |
|
border-radius: 15px; |
|
padding: 40px; |
|
text-align: center; |
|
background: linear-gradient(135deg, #f8f9ff 0%, #e3f2fd 100%); |
|
transition: all 0.3s ease; |
|
margin: 15px 0; |
|
} |
|
|
|
.file-upload-area:hover { |
|
border-color: #0056b3; |
|
background: linear-gradient(135deg, #f0f7ff 0%, #e1f5fe 100%); |
|
transform: translateY(-2px); |
|
} |
|
|
|
.metric-card { |
|
background: white; |
|
padding: 15px; |
|
border-radius: 10px; |
|
margin: 5px; |
|
box-shadow: 0 2px 8px rgba(0,0,0,0.1); |
|
text-align: center; |
|
} |
|
|
|
.sentiment-indicator { |
|
display: inline-block; |
|
padding: 5px 12px; |
|
border-radius: 20px; |
|
font-weight: bold; |
|
font-size: 12px; |
|
margin: 2px; |
|
} |
|
|
|
.sentiment-positive { background: #d4edda; color: #155724; } |
|
.sentiment-negative { background: #f8d7da; color: #721c24; } |
|
.sentiment-neutral { background: #d1ecf1; color: #0c5460; } |
|
|
|
.progress-bar { |
|
background: #e9ecef; |
|
border-radius: 10px; |
|
overflow: hidden; |
|
height: 8px; |
|
margin: 5px 0; |
|
} |
|
|
|
.progress-fill { |
|
height: 100%; |
|
background: linear-gradient(90deg, #28a745, #20c997); |
|
transition: width 0.3s ease; |
|
} |
|
""" |
|
|
|
def format_file_size(size_bytes): |
|
"""Convert bytes to human readable format""" |
|
for unit in ['B', 'KB', 'MB', 'GB']: |
|
if size_bytes < 1024.0: |
|
return f"{size_bytes:.1f} {unit}" |
|
size_bytes /= 1024.0 |
|
return f"{size_bytes:.1f} TB" |
|
|
|
    def get_sentiment_indicator(sentiment_score):
        """Get sentiment indicator HTML"""
        if sentiment_score > 0.1:
            return '<span class="sentiment-indicator sentiment-positive">😊 Positive</span>'
        elif sentiment_score < -0.1:
            return '<span class="sentiment-indicator sentiment-negative">😞 Negative</span>'
        else:
            return '<span class="sentiment-indicator sentiment-neutral">😐 Neutral</span>'
|
|
|
def process_and_display(file, summary_type, summary_length, enable_ai_features): |
|
"""Enhanced processing with comprehensive results display""" |
|
if file is None: |
|
return ( |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(value=""" |
|
<div style="text-align: center; padding: 60px; color: #666;"> |
|
<h3>π CatalystGPT-4 Ready</h3> |
|
<p>Upload a document to begin advanced AI-powered analysis</p> |
|
<p><small>Supports: PDF, Word (.docx), Text (.txt, .md, .rtf)</small></p> |
|
</div> |
|
""", visible=True) |
|
) |
|
|
|
try: |
|
|
|
actual_summary_type = summary_type if enable_ai_features else "extractive" |
|
|
|
result, error = summarizer.process_document(file.name, actual_summary_type, summary_length) |
|
|
|
if error: |
|
error_html = f''' |
|
<div class="error-container"> |
|
<h4>β Processing Error</h4> |
|
<p><strong>Error:</strong> {error}</p> |
|
<p><small>Please try a different file or check the file format.</small></p> |
|
</div> |
|
''' |
|
return ( |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(value=error_html, visible=True) |
|
) |
|
|
|
|
|
summary_html = f''' |
|
<div class="summary-container"> |
|
<h3>π― Document Summary</h3> |
|
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin-bottom: 15px;"> |
|
<div><strong>π File:</strong> {result["file_name"]}</div> |
|
<div><strong>π Size:</strong> {format_file_size(result["file_size"])}</div> |
|
<div><strong>π€ Model:</strong> {result["model_used"]}</div> |
|
<div><strong>π Length:</strong> {result["summary_length"].title()}</div> |
|
</div> |
|
<div style="background: rgba(255,255,255,0.15); padding: 20px; border-radius: 10px; line-height: 1.6;"> |
|
{result["summary"]} |
|
</div> |
|
</div> |
|
''' |
|
|
|
|
|
stats = result["stats"] |
|
readability = result["readability_score"] |
|
|
|
|
|
readability_color = "#28a745" if readability > 70 else "#ffc107" if readability > 40 else "#dc3545" |
|
readability_text = "Easy" if readability > 70 else "Moderate" if readability > 40 else "Complex" |
|
|
|
stats_html = f''' |
|
<div class="stats-container"> |
|
<h3>π Document Analytics</h3> |
|
|
|
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;"> |
|
<div class="metric-card"> |
|
<h4 style="margin: 0; color: #007bff;">π {stats["word_count"]:,}</h4> |
|
<small>Words</small> |
|
</div> |
|
<div class="metric-card"> |
|
<h4 style="margin: 0; color: #28a745;">β±οΈ {stats["estimated_reading_time"]} min</h4> |
|
<small>Reading Time</small> |
|
</div> |
|
<div class="metric-card"> |
|
<h4 style="margin: 0; color: #17a2b8;">π {stats["sentence_count"]:,}</h4> |
|
<small>Sentences</small> |
|
</div> |
|
<div class="metric-card"> |
|
<h4 style="margin: 0; color: #6f42c1;">π§ {stats.get("unique_words", "N/A")}</h4> |
|
<small>Unique Words</small> |
|
</div> |
|
</div> |
|
|
|
<div style="margin: 20px 0;"> |
|
<h4>π Readability Score</h4> |
|
<div class="progress-bar"> |
|
<div class="progress-fill" style="width: {readability}%; background-color: {readability_color};"></div> |
|
</div> |
|
<p><strong>{readability:.1f}/100</strong> - {readability_text} to read</p> |
|
</div> |
|
''' |
|
|
|
|
|
if stats.get('sentiment'): |
|
sentiment = stats['sentiment'] |
|
sentiment_html = get_sentiment_indicator(sentiment['compound']) |
|
stats_html += f''' |
|
<div style="margin: 20px 0;"> |
|
<h4>π Document Sentiment</h4> |
|
{sentiment_html} |
|
<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin-top: 10px;"> |
|
<small>Positive: {sentiment['positive']:.2f}</small> |
|
<small>Negative: {sentiment['negative']:.2f}</small> |
|
<small>Neutral: {sentiment['neutral']:.2f}</small> |
|
</div> |
|
</div> |
|
''' |
|
|
|
|
|
if stats.get('top_words'): |
|
stats_html += f''' |
|
<div style="margin: 20px 0;"> |
|
<h4>π€ Most Frequent Words</h4> |
|
<div style="display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px;"> |
|
{" ".join([f'<span style="background: rgba(255,255,255,0.2); padding: 6px 12px; border-radius: 15px; font-size: 13px;">{word} ({count})</span>' for word, count in stats["top_words"][:10]])} |
|
</div> |
|
</div> |
|
''' |
|
|
|
stats_html += '</div>' |
|
|
|
|
|
key_points_html = f''' |
|
<div class="key-points-container"> |
|
<h3>π― Key Insights</h3> |
|
<ul style="list-style: none; padding: 0;"> |
|
''' |
|
for i, point in enumerate(result["key_points"], 1): |
|
key_points_html += f'<li style="margin-bottom: 12px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 8px;"><strong>{i}.</strong> {point}</li>' |
|
key_points_html += '</ul></div>' |
|
|
|
|
|
outline_html = "" |
|
if result.get("outline"): |
|
outline_html = f''' |
|
<div class="outline-container"> |
|
<h3>π Document Structure</h3> |
|
<ol style="padding-left: 20px;"> |
|
''' |
|
for item in result["outline"]: |
|
outline_html += f'<li style="margin-bottom: 8px; padding: 5px 0;">{item}</li>' |
|
outline_html += '</ol></div>' |
|
|
|
return ( |
|
gr.update(value=summary_html, visible=True), |
|
gr.update(value=stats_html, visible=True), |
|
gr.update(value=key_points_html, visible=True), |
|
gr.update(value=outline_html, visible=True if outline_html else False), |
|
gr.update(visible=False) |
|
) |
|
|
|
except Exception as e: |
|
error_html = f''' |
|
<div class="error-container"> |
|
<h4>π₯ Unexpected Error</h4> |
|
<p><strong>Details:</strong> {str(e)}</p> |
|
<p><small>Please try again or contact support if the issue persists.</small></p> |
|
</div> |
|
''' |
|
return ( |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(value=error_html, visible=True) |
|
) |
|
|
|
|
|
with gr.Blocks(css=css, title="π CatalystGPT-4 Document Summarizer", theme=gr.themes.Soft()) as demo: |
|
|
|
|
|
gr.HTML(""" |
|
<div class="catalyst-header"> |
|
<h1 style="margin: 0; font-size: 3em; font-weight: bold;">π CatalystGPT-4</h1> |
|
<h2 style="margin: 10px 0; font-size: 1.5em; opacity: 0.9;">Advanced Document Summarizer</h2> |
|
<p style="margin: 15px 0 0 0; font-size: 1.1em; opacity: 0.8;"> |
|
Powered by AI • Extractive & Abstractive Summarization • Comprehensive Analytics
|
</p> |
|
</div> |
|
""") |
|
|
|
with gr.Row(): |
|
|
|
with gr.Column(scale=1): |
|
with gr.Group(): |
|
gr.HTML('<div class="control-panel">') |
|
|
|
gr.Markdown("### π Document Upload") |
|
file_upload = gr.File( |
|
label="Choose your document", |
|
file_types=[".pdf", ".docx", ".txt", ".md", ".rtf"], |
|
elem_classes="file-upload-area" |
|
) |
|
|
|
gr.Markdown("### βοΈ Analysis Settings") |
|
|
|
enable_ai_features = gr.Checkbox( |
|
label="π€ Enable AI Features", |
|
value=TRANSFORMERS_AVAILABLE, |
|
info="Use advanced AI models for better summarization", |
|
interactive=TRANSFORMERS_AVAILABLE |
|
) |
|
|
|
summary_type = gr.Radio( |
|
choices=[ |
|
("π§ AI Summary (Neural)", "ai"), |
|
("π Extractive Summary", "extractive") |
|
], |
|
value="ai" if TRANSFORMERS_AVAILABLE else "extractive", |
|
label="Summarization Method", |
|
info="AI generates new text, Extractive selects key sentences" |
|
) |
|
|
|
summary_length = gr.Radio( |
|
choices=[ |
|
("β‘ Short & Concise", "short"), |
|
("π Standard Length", "medium"), |
|
("π Detailed Analysis", "long"), |
|
("π Comprehensive Report", "detailed") |
|
], |
|
value="medium", |
|
label="Analysis Depth", |
|
info="Choose the level of detail for your analysis" |
|
) |
|
|
|
analyze_btn = gr.Button( |
|
"π Analyze Document", |
|
variant="primary", |
|
size="lg", |
|
elem_classes="analyze-button" |
|
) |
|
|
|
gr.HTML('</div>') |
|
|
|
|
|
gr.Markdown(f""" |
|
### System Status

**Core Features:**
- **PDF Processing:** {"✅ PyMuPDF" if PYMUPDF_AVAILABLE else ("✅ PyPDF2" if PDF_AVAILABLE else "❌ Not Available")}
- **Word Documents:** {"✅ Available" if DOCX_AVAILABLE else "❌ Install python-docx"}
- **AI Summarization:** {"✅ Available" if TRANSFORMERS_AVAILABLE else "❌ Install transformers"}
- **Advanced NLP:** {"✅ Available" if NLTK_AVAILABLE else "⚠️ Basic processing"}
- **Sentiment Analysis:** {"✅ Available" if (NLTK_AVAILABLE and summarizer.sentiment_analyzer) else "❌ Not Available"}

**Performance:**
- **Device:** {"GPU" if DEVICE >= 0 else "CPU"}
- **Cache:** {"Enabled" if summarizer.cache is not None else "Disabled"}
|
""") |
|
|
|
|
|
with gr.Column(scale=2): |
|
|
|
|
|
welcome_msg = gr.HTML( |
|
value=""" |
|
<div style="text-align: center; padding: 80px 20px; color: #666;"> |
|
<div style="font-size: 4em; margin-bottom: 20px;">π</div> |
|
<h2 style="color: #333; margin-bottom: 15px;">Ready for Analysis</h2> |
|
<p style="font-size: 1.1em; margin-bottom: 10px;">Upload any document to unlock AI-powered insights</p> |
|
<p><small style="color: #888;">Supports PDF, Word, Text, Markdown, and RTF files</small></p> |
|
<div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px; display: inline-block;"> |
|
<strong>Features:</strong> AI Summarization • Key Points • Analytics • Sentiment Analysis
|
</div> |
|
</div> |
|
""", |
|
visible=True |
|
) |
|
|
|
|
|
summary_display = gr.HTML(visible=False) |
|
stats_display = gr.HTML(visible=False) |
|
key_points_display = gr.HTML(visible=False) |
|
outline_display = gr.HTML(visible=False) |
|
error_display = gr.HTML(visible=False) |
|
|
|
|
|
def on_file_change(file): |
|
if file is None: |
|
return ( |
|
gr.update(visible=True), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False) |
|
) |
|
else: |
|
return ( |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update(visible=False) |
|
) |
|
|
|
|
|
file_upload.change( |
|
fn=on_file_change, |
|
inputs=[file_upload], |
|
outputs=[welcome_msg, summary_display, stats_display, key_points_display, outline_display, error_display] |
|
) |
|
|
|
|
|
analyze_btn.click( |
|
fn=process_and_display, |
|
inputs=[file_upload, summary_type, summary_length, enable_ai_features], |
|
outputs=[summary_display, stats_display, key_points_display, outline_display, error_display] |
|
) |
|
|
|
|
|
for component in [summary_type, summary_length, enable_ai_features]: |
|
component.change( |
|
fn=process_and_display, |
|
inputs=[file_upload, summary_type, summary_length, enable_ai_features], |
|
outputs=[summary_display, stats_display, key_points_display, outline_display, error_display] |
|
) |
|
|
|
|
|
gr.HTML(""" |
|
<div style="margin-top: 50px; padding: 30px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); |
|
border-radius: 15px; text-align: center; border-top: 3px solid #007bff;"> |
|
<h3 style="color: #333; margin-bottom: 20px;">π οΈ Installation & Setup</h3> |
|
|
|
<div style="background: #343a40; color: #fff; padding: 15px; border-radius: 8px; |
|
font-family: 'Courier New', monospace; margin: 15px 0;"> |
|
<strong>Quick Install:</strong><br> |
|
pip install gradio python-docx PyPDF2 transformers torch nltk PyMuPDF |
|
</div> |
|
|
|
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 20px;"> |
|
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);"> |
|
<strong>π― Core Features</strong><br> |
|
<small>Multi-format support, AI summarization, key insights extraction</small> |
|
</div> |
|
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);"> |
|
<strong>π Advanced Analytics</strong><br> |
|
<small>Sentiment analysis, readability scoring, word frequency</small> |
|
</div> |
|
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);"> |
|
<strong>π Performance</strong><br> |
|
<small>Intelligent caching, GPU acceleration, batch processing</small> |
|
</div> |
|
</div> |
|
|
|
<p style="margin-top: 20px; color: #666;"> |
|
<strong>CatalystGPT-4</strong> - Advanced Document Analysis Platform |
|
</p> |
|
</div> |
|
""") |
|
|
|
return demo |
|
|
|
if __name__ == "__main__": |
|
demo = create_catalyst_interface() |
|
demo.launch( |
|
server_name="0.0.0.0", |
|
server_port=7860, |
|
show_error=True, |
|
show_tips=True, |
|
enable_queue=True |
|
) |