|
|
|
import re |
|
from typing import Dict, Any, List, Optional |
|
from datetime import datetime |
|
|
|
class LinkedInParser:
    """Utility class for parsing and cleaning LinkedIn profile data.

    The public entry point is :meth:`clean_profile_data`, which takes the raw
    scraped dictionary and returns a normalized copy. The remaining public
    methods (keyword extraction, duration parsing, skill categorization and
    achievement extraction) can also be used on their own.
    """

    # Common English words ignored by ``extract_keywords``. Built once at
    # class level instead of on every call.
    _STOP_WORDS = frozenset({
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'between', 'among', 'within', 'without',
        'under', 'over', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their',
    })

    # Characters removed by ``_clean_text``. ``%`` and ``$`` are kept so that
    # metrics survive cleaning (otherwise ``extract_achievements`` can never
    # see them); ``+`` and ``#`` are kept so "C++" and "C#" do not collapse
    # into "C".
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\-.,!?()&/+#%$]')
    _WHITESPACE = re.compile(r'\s+')

    # Non-capturing group: with a capturing group ``re.findall`` would return
    # only the century prefix ("19"/"20") instead of the whole year.
    _YEAR = re.compile(r'\b(?:19|20)\d{2}\b')

    # A count, optionally with thousands separators ("1,234").
    _COUNT = re.compile(r'\d{1,3}(?:,\d{3})+|\d+')

    # Sentence body: any run of non-period characters, where a period between
    # two digits ("3.5") is treated as a decimal point, not a sentence end.
    _SENTENCE_BODY = r'(?:[^.]|(?<=\d)\.(?=\d))*'

    # Metric markers that identify a sentence as an achievement.
    _METRIC_PATTERNS = (
        r'\b\d+%',                        # percentages: "25%"
        r'\b\d+[kK]\+?',                  # thousands: "10k", "10K+"
        r'\b\d+[mM]\+?',                  # millions: "5M", "5m+"
        r'\$\d+',                         # dollar amounts: "$500"
        r'\b\d+\s*(?:years?|months?)',    # time spans: "3 years"
    )

    def __init__(self):
        # Keywords are matched as lowercase substrings of each skill name.
        self.skill_categories = {
            'technical': ['python', 'javascript', 'java', 'react', 'node.js', 'sql', 'aws', 'docker'],
            'management': ['leadership', 'project management', 'team management', 'agile', 'scrum'],
            'marketing': ['seo', 'social media', 'content marketing', 'digital marketing', 'analytics'],
            'design': ['ui/ux', 'photoshop', 'figma', 'adobe', 'design thinking'],
        }

    def clean_profile_data(self, raw_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean and standardize raw profile data

        Missing keys and keys explicitly set to ``None`` are both treated as
        empty values.

        Args:
            raw_data (Dict[str, Any]): Raw scraped data

        Returns:
            Dict[str, Any]: Cleaned profile data with the keys ``name``,
                ``headline``, ``location``, ``about``, ``experience``,
                ``education``, ``skills``, ``connections``, ``url`` and
                ``parsed_at`` (ISO-8601 timestamp of the local time at which
                the data was parsed).
        """
        raw_data = raw_data or {}
        cleaned_data: Dict[str, Any] = {}

        # Simple free-text fields.
        for field in ('name', 'headline', 'location', 'about'):
            cleaned_data[field] = self._clean_text(raw_data.get(field, ''))

        # Structured sections; each helper tolerates None.
        cleaned_data['experience'] = self._clean_experience_list(
            raw_data.get('experience', [])
        )
        cleaned_data['education'] = self._clean_education_list(
            raw_data.get('education', [])
        )
        cleaned_data['skills'] = self._clean_skills_list(
            raw_data.get('skills', [])
        )
        cleaned_data['connections'] = self._parse_connections(
            raw_data.get('connections', '')
        )

        cleaned_data['url'] = raw_data.get('url', '')
        cleaned_data['parsed_at'] = datetime.now().isoformat()

        return cleaned_data

    def extract_keywords(self, text: str, min_length: int = 3) -> List[str]:
        """
        Extract meaningful keywords from text

        The text is lowercased, punctuation is replaced by spaces, and words
        that are stop words or shorter than ``min_length`` are dropped.

        Args:
            text (str): Input text
            min_length (int): Minimum keyword length

        Returns:
            List[str]: Unique keywords in order of first appearance
        """
        if not text:
            return []

        words = re.sub(r'[^\w\s]', ' ', text.lower()).split()

        # dict.fromkeys removes duplicates while preserving first-seen order.
        return list(dict.fromkeys(
            word for word in words
            if len(word) >= min_length and word not in self._STOP_WORDS
        ))

    def parse_duration(self, duration_str: str) -> Dict[str, Any]:
        """
        Parse duration strings like "2020 - Present" or "Jan 2020 - Dec 2022"

        Args:
            duration_str (str): Duration string

        Returns:
            Dict[str, Any]: Parsed duration info with the keys:
                ``raw`` (the input), ``start_date`` and ``end_date``
                (four-digit year strings, or None when absent),
                ``is_current`` (True when the string contains "present",
                case-insensitively) and ``duration_months`` (always 0; it is
                not computed by this method).
        """
        duration_info = {
            'raw': duration_str,
            'start_date': None,
            'end_date': None,
            'is_current': False,
            'duration_months': 0,
        }

        if not duration_str:
            return duration_info

        duration_info['is_current'] = 'present' in duration_str.lower()

        # First year found is the start, second (if any) is the end.
        years = self._YEAR.findall(duration_str)
        if years:
            duration_info['start_date'] = years[0]
            duration_info['end_date'] = years[1] if len(years) > 1 else None

        return duration_info

    def categorize_skills(self, skills: List[str]) -> Dict[str, List[str]]:
        """
        Categorize skills into different types

        A skill goes into the first category (in ``self.skill_categories``
        order) that has a keyword occurring as a substring of the lowercased
        skill; skills matching no category go into ``'other'``.

        Args:
            skills (List[str]): List of skills

        Returns:
            Dict[str, List[str]]: Categorized skills, one list per category
                in ``self.skill_categories`` plus ``'other'``
        """
        # Derive the buckets from the configured categories so that adding a
        # category to ``self.skill_categories`` cannot raise KeyError here.
        categorized: Dict[str, List[str]] = {
            category: [] for category in self.skill_categories
        }
        categorized.setdefault('other', [])

        for skill in skills or []:
            skill_lower = str(skill).lower()

            for category, keywords in self.skill_categories.items():
                if any(keyword in skill_lower for keyword in keywords):
                    categorized[category].append(skill)
                    break
            else:
                # No category matched.
                categorized['other'].append(skill)

        return categorized

    def extract_achievements(self, text: str) -> List[str]:
        """
        Extract achievements with numbers/metrics from text

        A sentence (text between periods) counts as an achievement when it
        contains a percentage, a k/M-suffixed number, a dollar amount, or a
        number of years/months. A period between two digits is treated as a
        decimal point, so "3.5%" stays within one sentence.

        Args:
            text (str): Input text

        Returns:
            List[str]: Unique achievement sentences, stripped of surrounding
                whitespace. Sentences are grouped by metric type (percent,
                k, M, dollar, time span) and, within a type, appear in text
                order.
        """
        if not text:
            return []

        achievements: List[str] = []
        seen = set()

        for metric in self._METRIC_PATTERNS:
            # The pattern has no capturing groups, so findall yields the
            # whole matched sentence.
            pattern = self._SENTENCE_BODY + metric + self._SENTENCE_BODY
            for match in re.findall(pattern, text, re.IGNORECASE):
                sentence = match.strip()
                # A sentence with several metrics matches several patterns;
                # report it only once.
                if sentence and sentence not in seen:
                    seen.add(sentence)
                    achievements.append(sentence)

        return achievements

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Removes characters other than word characters, whitespace and
        ``- . , ! ? ( ) & / + # % $``, then collapses runs of whitespace into
        single spaces and trims the ends. Falsy input yields ``""``; other
        non-string input is converted with ``str()`` first.
        """
        if not text:
            return ""

        if not isinstance(text, str):
            text = str(text)

        # Strip disallowed characters first: doing it after whitespace
        # collapsing would leave double or trailing spaces behind.
        text = self._DISALLOWED_CHARS.sub('', text)
        return self._WHITESPACE.sub(' ', text).strip()

    def _clean_experience_list(self, experience_list: List[Dict]) -> List[Dict]:
        """Clean experience entries.

        Non-dict entries are skipped. Each cleaned entry has the text fields
        ``title``, ``company``, ``duration``, ``description`` and
        ``location``, plus ``duration_info`` (from ``parse_duration``) and
        ``achievements`` (from ``extract_achievements`` on the description).
        """
        cleaned_experience = []

        for exp in experience_list or []:
            if not isinstance(exp, dict):
                continue

            cleaned_exp = {
                field: self._clean_text(exp.get(field, ''))
                for field in ('title', 'company', 'duration', 'description', 'location')
            }
            cleaned_exp['duration_info'] = self.parse_duration(cleaned_exp['duration'])
            cleaned_exp['achievements'] = self.extract_achievements(
                cleaned_exp['description']
            )
            cleaned_experience.append(cleaned_exp)

        return cleaned_experience

    def _clean_education_list(self, education_list: List[Dict]) -> List[Dict]:
        """Clean education entries.

        Non-dict entries are skipped. Each cleaned entry has the text fields
        ``degree``, ``school``, ``year`` and ``field``.
        """
        return [
            {
                field: self._clean_text(edu.get(field, ''))
                for field in ('degree', 'school', 'year', 'field')
            }
            for edu in education_list or []
            if isinstance(edu, dict)
        ]

    def _clean_skills_list(self, skills_list: List[str]) -> List[str]:
        """Clean and deduplicate skills.

        Deduplication is case-insensitive and keeps the first spelling seen.
        ``None`` entries and entries that are empty after cleaning are
        dropped.
        """
        if not skills_list:
            return []

        cleaned_skills = []
        seen_skills = set()

        for skill in skills_list:
            # str(None) would otherwise produce the bogus skill "None".
            if skill is None:
                continue

            cleaned_skill = self._clean_text(str(skill))
            skill_key = cleaned_skill.lower()

            if cleaned_skill and skill_key not in seen_skills:
                seen_skills.add(skill_key)
                cleaned_skills.append(cleaned_skill)

        return cleaned_skills

    def _parse_connections(self, connections_str: str) -> int:
        """Parse connection count from string.

        Returns the first number in the string, honoring thousands
        separators ("1,234" -> 1234, "500+ connections" -> 500), or 0 when
        the input is empty or contains no digits.
        """
        if not connections_str:
            return 0

        match = self._COUNT.search(str(connections_str))
        if match is None:
            return 0

        return int(match.group().replace(',', ''))
|
|