"""IntelliCV: a Streamlit app that predicts a job category for an uploaded resume.

The app downloads a pickled classifier, TF-IDF vectorizer and label encoder
from the Hugging Face Hub, extracts text from an uploaded PDF/DOCX/TXT file,
cleans it, and displays the predicted category together with a few text
statistics.
"""

import pickle
import re
import string
from io import BytesIO

import docx
import PyPDF2
import streamlit as st
from huggingface_hub import hf_hub_download

# Hugging Face repository and artifact names for the pickled model objects.
MODEL_REPO_ID = "psychomita/intellicv-models"
MODEL_FILENAMES = ("clf.pkl", "tfidf.pkl", "encoder.pkl")

# Upload size limit enforced in main(); keep in sync with the user-facing text.
MAX_UPLOAD_BYTES = 10 * 1024 * 1024

# Markdown shown in the UI. Built with explicit newlines so the lists render
# as lists regardless of how the surrounding code is indented.
_INTRO_MARKDOWN = (
    "Upload your resume and discover which job category it best matches "
    "with our AI analysis.\n"
    "Supported formats: PDF, DOCX, and TXT."
)
_ABOUT_MARKDOWN = (
    "IntelliCV uses machine learning to analyze resume content and predict "
    "the most suitable job category.\n"
    "\n"
    "**How it works:**\n"
    "1. Upload your resume\n"
    "2. AI extracts and cleans the text\n"
    "3. Our model predicts the job category\n"
    "4. View detailed analysis"
)
_CATEGORIES_MARKDOWN = (
    "**Common Categories:**\n"
    "- Data Science\n"
    "- HR\n"
    "- Design\n"
    "- Information Technology\n"
    "- Education\n"
    "- And more..."
)

# Set page configuration (must run before any other Streamlit command).
st.set_page_config(
    page_title="IntelliCV: AI Resume Analyzer",
    page_icon="📄",
    layout="centered",
    initial_sidebar_state="expanded",
)

# Add some basic CSS styling directly in the app.
# NOTE(review): the style payload is blank in this copy of the file; confirm
# whether a <style> block is supposed to be here.
st.markdown(""" """, unsafe_allow_html=True)


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE(security): pickle.load can execute arbitrary code. The files come
    # from a remote repository, so this is only safe while that repository
    # is trusted.
    with open(path, "rb") as handle:
        return pickle.load(handle)


@st.cache_resource(show_spinner="Loading AI models...")
def _fetch_models():
    """Download and unpickle the classifier, vectorizer and label encoder.

    Errors propagate to the caller on purpose: only a successful return value
    is stored by the cache, so a transient failure is retried on the next run
    instead of being remembered.
    """
    paths = [
        hf_hub_download(repo_id=MODEL_REPO_ID, filename=name, repo_type="model")
        for name in MODEL_FILENAMES
    ]
    return tuple(_load_pickle(path) for path in paths)


def load_models():
    """Return ``(classifier, tfidf_vectorizer, label_encoder)``.

    On any failure an error is shown in the app and ``(None, None, None)``
    is returned instead of raising.
    """
    try:
        return _fetch_models()
    except Exception as e:  # top-level boundary: report and degrade gracefully
        st.error(f"Failed to load models: {str(e)}")
        return None, None, None


# Load models
svc_model, tfidf, le = load_models()

# Substitutions applied in order by cleanResume(); each match becomes a space.
# Order matters: e-mail addresses must be removed before mentions/punctuation
# strip the "@", and whitespace is collapsed last so earlier substitutions do
# not leave runs of spaces behind.
_CLEAN_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'http\S+|www\S+|https\S+',             # URLs
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',          # Email addresses
        r'\bRT\b|\bretweet\b',                  # Retweets
        r'#\S+',                                # Hashtags
        r'@\S+',                                # Mentions
        r'[%s]' % re.escape(string.punctuation),  # Punctuation
        r'[^\x00-\x7f]',                        # Non-ASCII chars
        r'\b\d+\b',                             # Standalone numbers
        r'\s+',                                 # Extra whitespace
    )
)


def cleanResume(txt) -> str:
    """Return *txt* with URLs, e-mails, tags, punctuation and numbers removed.

    Non-string input yields an empty string. The result contains only ASCII
    characters, single spaces between tokens, and no leading or trailing
    whitespace.
    """
    if not isinstance(txt, str):
        return ""
    cleaned = txt
    for pattern in _CLEAN_PATTERNS:
        cleaned = pattern.sub(' ', cleaned)
    return cleaned.strip()


def extract_text_from_pdf(file):
    """Return the text of every readable page of a PDF, joined by spaces.

    Pages whose extraction fails are skipped with a warning in the app.

    Raises:
        ValueError: if the file cannot be opened as a PDF at all.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(file.read()))
        text = []
        for page in pdf_reader.pages:
            try:
                text.append(page.extract_text() or '')
            except Exception as e:
                # Best effort: one bad page should not discard the rest.
                st.warning(f"Could not extract text from one page: {str(e)}")
        return ' '.join(text)
    except Exception as e:
        raise ValueError(f"Failed to read PDF file: {str(e)}") from e


def extract_text_from_docx(file):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Raises:
        ValueError: if the file cannot be read as a DOCX document.
    """
    try:
        doc = docx.Document(BytesIO(file.read()))
        return '\n'.join(
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        )
    except Exception as e:
        raise ValueError(f"Failed to read DOCX file: {str(e)}") from e


def extract_text_from_txt(file):
    """Decode a plain-text upload, trying several common encodings.

    Raises:
        ValueError: if none of the encodings can decode the content.
    """
    raw = file.read()  # read once; every attempt decodes the same bytes
    # latin-1 accepts any byte sequence, so it has to come last; otherwise
    # the encodings listed after it could never be tried.
    for encoding in ('utf-8', 'windows-1252', 'latin-1'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode text file with common encodings")


# Maps a lower-cased file extension to the function that extracts its text.
_EXTRACTORS = {
    'pdf': extract_text_from_pdf,
    'docx': extract_text_from_docx,
    'txt': extract_text_from_txt,
}


def handle_file_upload(uploaded_file):
    """Extract text from an uploaded file based on its extension.

    Returns:
        The extracted text, or ``None`` when *uploaded_file* is falsy.

    Raises:
        ValueError: for an unsupported extension or any extraction failure.
    """
    if not uploaded_file:
        return None

    file_extension = uploaded_file.name.split('.')[-1].lower()
    extractor = _EXTRACTORS.get(file_extension)
    if extractor is None:
        # Raised outside the try below so the message is not wrapped twice.
        raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")

    try:
        return extractor(uploaded_file)
    except Exception as e:
        raise ValueError(f"Error processing file: {str(e)}") from e


def predict_category(input_resume) -> str:
    """Predict the job category label for raw resume text.

    Uses the module-level ``tfidf``, ``svc_model`` and ``le`` objects.

    Returns:
        The decoded label, or an ``"Unknown..."`` placeholder when the input
        is empty/not a string, cleans down to nothing, or prediction fails
        (in which case the error is also shown in the app).
    """
    if not input_resume or not isinstance(input_resume, str):
        return "Unknown"

    try:
        cleaned_text = cleanResume(input_resume)
        if not cleaned_text.strip():
            return "Unknown (insufficient text)"

        vectorized_text = tfidf.transform([cleaned_text]).toarray()
        predicted_category = svc_model.predict(vectorized_text)
        return le.inverse_transform(predicted_category)[0]
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return "Unknown (prediction failed)"


def display_results(resume_text, category):
    """Render the predicted category and cleaned-text statistics.

    Args:
        resume_text: the raw text extracted from the upload.
        category: the label returned by predict_category().
    """
    st.subheader("Analysis Results")

    # Category display with emoji; unknown categories get a generic icon.
    category_emojis = {
        "Data Science": "📊",
        "HR": "👥",
        "Design": "🎨",
        "Information Technology": "💻",
        "Education": "📚",
        "Business Development": "📈",
        "Marketing": "📢",
        "Sales": "💰",
        "Health and Fitness": "💪",
        "Engineering": "⚙️",
    }
    emoji = category_emojis.get(category, "🔍")

    st.markdown(
        f"\n\nPredicted Job Category\n\n{emoji} {category}\n\n",
        unsafe_allow_html=True,
    )

    # Text analysis section
    with st.expander("Text Analysis Details"):
        st.markdown("**Cleaned Text Excerpt:**")
        cleaned_text = cleanResume(resume_text)
        if len(cleaned_text) > 500:
            excerpt = cleaned_text[:500] + "..."
        else:
            excerpt = cleaned_text
        st.text(excerpt)

        # With no original text there is nothing to reduce, so report 0%
        # rather than a misleading 100%.
        if resume_text:
            reduction = 100 - len(cleaned_text) / len(resume_text) * 100
        else:
            reduction = 0.0

        st.markdown("**Statistics:**")
        col1, col2, col3 = st.columns(3)
        col1.metric("Original Length", f"{len(resume_text):,} chars")
        col2.metric("Cleaned Length", f"{len(cleaned_text):,} chars")
        col3.metric("Reduction", f"{reduction:.1f}%")


def main():
    """Render the page: intro, sidebar, uploader, analysis and download."""
    st.title("📄 IntelliCV: AI-Powered Resume Analyzer")
    st.markdown(_INTRO_MARKDOWN)

    # Sidebar with additional info
    with st.sidebar:
        st.header("About")
        st.markdown(_ABOUT_MARKDOWN)
        st.markdown("---")
        st.markdown(_CATEGORIES_MARKDOWN)

    # File upload section
    uploaded_file = st.file_uploader(
        "Upload your resume",
        type=["pdf", "docx", "txt"],
        help="Supported formats: PDF, DOCX, TXT (max 10MB)",
    )
    if uploaded_file is None:
        return

    # Check file size
    if uploaded_file.size > MAX_UPLOAD_BYTES:
        st.error("File size exceeds 10MB limit. Please upload a smaller file.")
        return

    with st.spinner("🔍 Analyzing your resume..."):
        try:
            resume_text = handle_file_upload(uploaded_file)

            # handle_file_upload may return None; treat it like empty text.
            if not resume_text or not resume_text.strip():
                st.warning("The uploaded file appears to be empty or couldn't be read properly.")
                return

            category = predict_category(resume_text)
            display_results(resume_text, category)

            # Download button for cleaned text: serve the cleaned text, which
            # is what the label and file name promise.
            st.download_button(
                label="Download Cleaned Text",
                data=cleanResume(resume_text),
                file_name="cleaned_resume.txt",
                mime="text/plain",
            )
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.info("Please try another file or check the format.")


if __name__ == "__main__":
    # Compare against None explicitly; model objects are not guaranteed to
    # define a meaningful truth value.
    if all(model is not None for model in (svc_model, tfidf, le)):
        main()
    else:
        st.error("Failed to load required models. Please try again later.")