import os
import time
import json
from typing import Dict, Any

import requests
from dotenv import load_dotenv

load_dotenv()
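
# Note: the startup checks in ScraperAgent assume a .env entry of the form
#   APIFY_API_TOKEN=apify_api_...
# (the 'apify_api_' prefix is only warned about, not enforced).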

class ScraperAgent:
    """Agent responsible for extracting data from LinkedIn profiles using the Apify REST API."""

    def __init__(self):
        self.apify_token = os.getenv('APIFY_API_TOKEN')
        if not self.apify_token:
            raise ValueError("APIFY_API_TOKEN not found in environment variables")

        # Apify personal API tokens conventionally begin with 'apify_api_'.
        if not self.apify_token.startswith('apify_api_'):
            print(f"⚠️ Warning: Token doesn't start with 'apify_api_'. Current token starts with: {self.apify_token[:10]}...")

        # The run-sync-get-dataset-items endpoint runs the actor synchronously
        # and returns the resulting dataset items in the response body.
        self.api_url = (
            "https://api.apify.com/v2/acts/dev_fusion~linkedin-profile-scraper"
            f"/run-sync-get-dataset-items?token={self.apify_token}"
        )

        print(f"🔑 Using Apify token: {self.apify_token[:15]}...")

    def extract_profile_data(self, linkedin_url: str) -> Dict[str, Any]:
        """
        Extract profile data from a LinkedIn URL using the Apify REST API.

        Args:
            linkedin_url (str): LinkedIn profile URL

        Returns:
            Dict[str, Any]: Extracted profile data
        """
        try:
            print(f"🔍 Starting scraping for: {linkedin_url}")
            print(f"🔗 URL being processed: {linkedin_url}")
            print(f"⏰ Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}")

            # Normalize the URL: trim whitespace and ensure a scheme is present.
            original_url = linkedin_url
            linkedin_url = linkedin_url.strip()
            if not linkedin_url.startswith('http'):
                linkedin_url = 'https://' + linkedin_url

            print(f"🧹 Cleaned URL: {linkedin_url}")

            if original_url != linkedin_url:
                print(f"🔄 URL normalized: {original_url} → {linkedin_url}")

            # Input for the dev_fusion~linkedin-profile-scraper actor run.
            run_input = {
                "profileUrls": [linkedin_url],
                "slowDown": True,
                "includeSkills": True,
                "includeExperience": True,
                "includeEducation": True,
                "includeRecommendations": False,
                "saveHtml": False,
                "saveMarkdown": False
            }

            print(f"📝 Apify input: {json.dumps(run_input, indent=2)}")

            print("🚀 Running Apify scraper via REST API...")
            response = requests.post(
                self.api_url,
                json=run_input,
                headers={'Content-Type': 'application/json'},
                timeout=180
            )

            if response.status_code in (200, 201):
                results = response.json()
                print(f"✅ API Response received: {len(results)} items")

                if results:
                    raw_data = results[0]
                    processed_data = self._process_apify_data(raw_data, linkedin_url)
                    print("✅ Successfully extracted and processed profile data")
                    return processed_data
                else:
                    error_msg = ("No data returned from Apify API. The profile may be "
                                 "private or the scraper encountered an issue.")
                    print(f"❌ {error_msg}")
                    raise ValueError(error_msg)
            else:
                # Pull a human-readable message out of the error payload if possible.
                try:
                    error_response = response.json()
                    error_details = f" - {error_response.get('error', {}).get('message', response.text)}"
                except ValueError:
                    error_details = f" - {response.text}"

                if response.status_code == 401:
                    error_msg = f"Authentication failed (401): Invalid or expired API token{error_details}"
                    print(f"❌ {error_msg}")
                    print(f"🔑 Token being used: {self.apify_token[:15]}...")
                    print("💡 Please check your APIFY_API_TOKEN in your .env file")
                elif response.status_code == 404:
                    error_msg = f"Actor not found (404): The actor 'dev_fusion~linkedin-profile-scraper' may not exist{error_details}"
                    print(f"❌ {error_msg}")
                elif response.status_code == 429:
                    error_msg = f"Rate limit exceeded (429): Too many requests{error_details}"
                    print(f"❌ {error_msg}")
                else:
                    error_msg = f"API request failed with status {response.status_code}{error_details}"
                    print(f"❌ {error_msg}")

                raise requests.RequestException(error_msg)

        except requests.Timeout:
            error_msg = "Request timed out. The scraping operation took too long to complete."
            print(f"⏰ {error_msg}")
            raise requests.Timeout(error_msg)
        except Exception as e:
            error_msg = f"Error extracting profile data: {str(e)}"
            print(f"❌ {error_msg}")
            raise Exception(error_msg) from e

    def test_apify_connection(self) -> bool:
        """Test whether the Apify connection is working."""
        try:
            # Fetching the actor's metadata is a cheap way to validate the token.
            test_url = f"https://api.apify.com/v2/acts/dev_fusion~linkedin-profile-scraper?token={self.apify_token}"
            print(f"🔍 Testing connection to: {test_url[:50]}...")

            response = requests.get(test_url, timeout=10)

            if response.status_code == 200:
                actor_info = response.json()
                print(f"✅ Successfully connected to Apify actor: {actor_info.get('name', 'LinkedIn Profile Scraper')}")
                return True
            elif response.status_code == 401:
                print("❌ Authentication failed (401): Invalid or expired API token")
                print(f"🔑 Token being used: {self.apify_token[:15]}...")
                print("💡 Please check your APIFY_API_TOKEN in your .env file")
                return False
            elif response.status_code == 404:
                print("❌ Actor not found (404): The actor 'dev_fusion~linkedin-profile-scraper' may not exist or may not be accessible")
                return False
            else:
                print(f"❌ Failed to connect to Apify: {response.status_code} - {response.text}")
                return False
        except Exception as e:
            print(f"❌ Failed to connect to Apify: {str(e)}")
            return False

    def _process_apify_data(self, raw_data: Dict[str, Any], url: str) -> Dict[str, Any]:
        """Process raw Apify data into a standardized format."""
        print(f"🔄 Processing data for URL: {url}")
        print(f"📝 Raw data keys: {list(raw_data.keys())}")

        # Top-level profile fields.
        profile_data = {
            'name': raw_data.get('fullName', ''),
            'headline': raw_data.get('headline', ''),
            'location': raw_data.get('addressWithCountry', raw_data.get('addressWithoutCountry', '')),
            'about': raw_data.get('about', ''),
            'connections': raw_data.get('connections', 0),
            'followers': raw_data.get('followers', 0),
            'email': raw_data.get('email', ''),
            'url': url,
            'profile_image': raw_data.get('profilePic', ''),
            'profile_image_hq': raw_data.get('profilePicHighQuality', ''),
            'scraped_at': time.strftime('%Y-%m-%d %H:%M:%S'),
            'job_title': raw_data.get('jobTitle', ''),
            'company_name': raw_data.get('companyName', ''),
            'company_industry': raw_data.get('companyIndustry', ''),
            'company_website': raw_data.get('companyWebsite', ''),
            'company_size': raw_data.get('companySize', ''),
            'current_job_duration': raw_data.get('currentJobDuration', ''),
            'top_skills': raw_data.get('topSkillsByEndorsements', '')
        }

        print(f"✅ Extracted profile for: {profile_data.get('name', 'Unknown')}")
        print(f"🔗 Profile URL stored: {profile_data['url']}")

        # Experience entries; descriptions are flattened from nested subComponents.
        experience_list = []
        for exp in raw_data.get('experiences', []):
            caption = exp.get('caption', '')
            experience_item = {
                'title': exp.get('title', ''),
                'company': exp.get('subtitle', '').replace(' · Full-time', '').replace(' · Part-time', ''),
                'duration': caption,
                'description': '',
                'location': exp.get('metadata', ''),
                'company_logo': exp.get('logo', ''),
                'is_current': 'Present' in caption or '·' not in caption
            }

            # Collect description text across all subComponents.
            descriptions = []
            for sub in exp.get('subComponents') or []:
                for desc in sub.get('description') or []:
                    if isinstance(desc, dict) and desc.get('text'):
                        descriptions.append(desc['text'])
            experience_item['description'] = ' '.join(descriptions)

            experience_list.append(experience_item)
        profile_data['experience'] = experience_list

        # Education entries.
        education_list = []
        for edu in raw_data.get('educations', []):
            education_item = {
                'degree': edu.get('subtitle', ''),
                'school': edu.get('title', ''),
                'field': '',
                'year': edu.get('caption', ''),
                'logo': edu.get('logo', ''),
                'grade': ''
            }

            # Split "Degree - Field" or "Degree, Field" subtitles into parts.
            subtitle = edu.get('subtitle', '')
            if ' - ' in subtitle:
                parts = subtitle.split(' - ', 1)
                education_item['degree'] = parts[0]
                education_item['field'] = parts[1] if len(parts) > 1 else ''
            elif ', ' in subtitle:
                parts = subtitle.split(', ', 1)
                education_item['degree'] = parts[0]
                education_item['field'] = parts[1] if len(parts) > 1 else ''

            # Grades are buried in subComponents descriptions, e.g. "Grade: ...".
            for sub in edu.get('subComponents') or []:
                for desc in sub.get('description') or []:
                    if isinstance(desc, dict) and desc.get('text', '').startswith('Grade:'):
                        education_item['grade'] = desc['text']

            education_list.append(education_item)
        profile_data['education'] = education_list

        # Skills may arrive as dicts with a 'title' key or as plain strings.
        skills_list = []
        for skill in raw_data.get('skills', []):
            if isinstance(skill, dict) and 'title' in skill:
                skills_list.append(skill['title'])
            elif isinstance(skill, str):
                skills_list.append(skill)
        profile_data['skills'] = skills_list

        # Licenses and certifications.
        certifications_list = []
        for cert in raw_data.get('licenseAndCertificates', []):
            certifications_list.append({
                'title': cert.get('title', ''),
                'issuer': cert.get('subtitle', ''),
                'date': cert.get('caption', ''),
                'credential_id': cert.get('metadata', ''),
                'logo': cert.get('logo', '')
            })
        profile_data['certifications'] = certifications_list

        profile_data['languages'] = raw_data.get('languages', [])

        # Keep only dict-shaped volunteer entries.
        profile_data['volunteer_experience'] = [
            vol for vol in raw_data.get('volunteerAndAwards', []) if isinstance(vol, dict)
        ]

        # Remaining sections are passed through unchanged.
        profile_data['honors_awards'] = raw_data.get('honorsAndAwards', [])
        profile_data['projects'] = raw_data.get('projects', [])
        profile_data['publications'] = raw_data.get('publications', [])
        profile_data['recommendations'] = raw_data.get('recommendations', [])
        profile_data['interests'] = raw_data.get('interests', [])

        return profile_data
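

# Minimal usage sketch: the profile URL below is a hypothetical placeholder,
# and a valid APIFY_API_TOKEN must be available in the environment or .env file.
if __name__ == "__main__":
    agent = ScraperAgent()
    if agent.test_apify_connection():
        profile = agent.extract_profile_data("https://www.linkedin.com/in/example")
        print(json.dumps(profile, indent=2))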