import streamlit as st import pickle import docx import PyPDF2 import re from huggingface_hub import hf_hub_download from io import BytesIO # Set page configuration st.set_page_config( page_title="IntelliCV: AI Resume Analyzer", page_icon="📄", layout="centered", initial_sidebar_state="expanded" ) # Add some basic CSS styling directly in the app st.markdown(""" """, unsafe_allow_html=True) # Cache model loading with improved error handling @st.cache_resource(show_spinner="Loading AI models...") def load_models(): try: repo_id = "psychomita/intellicv-models" clf_path = hf_hub_download(repo_id=repo_id, filename="clf.pkl", repo_type="model") tfidf_path = hf_hub_download(repo_id=repo_id, filename="tfidf.pkl", repo_type="model") encoder_path = hf_hub_download(repo_id=repo_id, filename="encoder.pkl", repo_type="model") svc_model = pickle.load(open(clf_path, 'rb')) tfidf = pickle.load(open(tfidf_path, 'rb')) le = pickle.load(open(encoder_path, 'rb')) return svc_model, tfidf, le except Exception as e: st.error(f"Failed to load models: {str(e)}") return None, None, None # Load models svc_model, tfidf, le = load_models() def cleanResume(txt): """Improved text cleaning function with more comprehensive patterns""" if not isinstance(txt, str): return "" clean_patterns = [ (r'http\S+|www\S+|https\S+', ' '), # URLs (r'\bRT\b|\bretweet\b', ' '), # Retweets (r'#\S+', ' '), # Hashtags (r'@\S+', ' '), # Mentions (r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' '), # Punctuation (r'[^\x00-\x7f]', ' '), # Non-ASCII chars (r'\s+', ' '), # Extra whitespace (r'\b\d+\b', ' '), # Standalone numbers (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' ') # Email addresses ] cleanText = txt for pattern, repl in clean_patterns: cleanText = re.sub(pattern, repl, cleanText) return cleanText.strip() def extract_text_from_pdf(file): """Improved PDF text extraction with error handling""" try: pdf_reader = PyPDF2.PdfReader(BytesIO(file.read())) text = [] for page in pdf_reader.pages: try: page_text = page.extract_text() or '' text.append(page_text) except Exception as e: st.warning(f"Could not extract text from one page: {str(e)}") continue return ' '.join(text) except Exception as e: raise ValueError(f"Failed to read PDF file: {str(e)}") def extract_text_from_docx(file): """Improved DOCX text extraction with error handling""" try: doc = docx.Document(BytesIO(file.read())) return '\n'.join(paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()) except Exception as e: raise ValueError(f"Failed to read DOCX file: {str(e)}") def extract_text_from_txt(file): """Improved text file extraction with multiple encoding attempts""" encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'windows-1252'] for encoding in encodings: try: return BytesIO(file.read()).read().decode(encoding) except UnicodeDecodeError: file.seek(0) raise ValueError("Failed to decode text file with common encodings") def handle_file_upload(uploaded_file): """Handle file upload with better type checking""" if not uploaded_file: return None file_extension = uploaded_file.name.split('.')[-1].lower() try: if file_extension == 'pdf': return extract_text_from_pdf(uploaded_file) elif file_extension == 'docx': return extract_text_from_docx(uploaded_file) elif file_extension == 'txt': return extract_text_from_txt(uploaded_file) else: raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.") except Exception as e: raise ValueError(f"Error processing file: {str(e)}") def predict_category(input_resume): """Make prediction with input validation""" if not input_resume or not isinstance(input_resume, str): return "Unknown" try: cleaned_text = cleanResume(input_resume) if not cleaned_text.strip(): return "Unknown (insufficient text)" vectorized_text = tfidf.transform([cleaned_text]).toarray() predicted_category = svc_model.predict(vectorized_text) return le.inverse_transform(predicted_category)[0] except Exception as e: st.error(f"Prediction error: {str(e)}") return "Unknown (prediction failed)" def display_results(resume_text, category): """Display results in a more engaging way""" st.subheader("Analysis Results") # Category display with emoji category_emojis = { "Data Science": "📊", "HR": "👥", "Design": "🎨", "Information Technology": "💻", "Education": "📚", "Business Development": "📈", "Marketing": "📢", "Sales": "💰", "Health and Fitness": "💪", "Engineering": "⚙️" } emoji = category_emojis.get(category, "🔍") st.markdown(f"""
{emoji} {category}