diff --git a/.gitignore b/.gitignore
index 6a191b2f..6779fff5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,37 @@
-/venv/
-/.idea
-**/__pycache__/
-**/.pytest_cache/
-/.ipynb_checkpoints/
-**/output/
-**/.DS_Store
-*.pyc
-.env
-dist
\ No newline at end of file
+# Sensitive configuration
+config_sensitive.py
+
+# Generated files
+*.csv
+*.log
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 00000000..99693648
--- /dev/null
+++ b/config.py
@@ -0,0 +1,23 @@
+# Default configuration that can be committed
+DEFAULT_CONFIG = {
+    'search_term': 'IT Engineer',
+    'location': 'Lone Tree, CO',
+    'distance': 25,
+    'results_wanted': 50,
+    'job_type': 'fulltime',
+    'hours_old': 72,
+    'search_sites': ["indeed", "linkedin"],
+    'exclude_clearance': True,
+    'clearance_keywords': [
+        'clearance', 'security clearance', 'secret', 'top secret',
+        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
+        'public trust', 'security+', 'security plus'
+    ]
+}
+
+
+try:
+    # Try to import sensitive config from a local file; a plain (absolute)
+    # import is used because a relative import fails when this runs as a script
+    from config_sensitive import SENSITIVE_CONFIG
+except ImportError:
+    print("Warning: No sensitive configuration found. Using defaults.")
+    SENSITIVE_CONFIG = {}  # fall back to an empty override set
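+
+
+# Hedged usage sketch: nothing above or below is imported by the search
+# scripts yet; this helper only illustrates how the two dictionaries are
+# meant to combine, with sensitive values overriding committed defaults.
+def get_config() -> dict:
+    """Return DEFAULT_CONFIG merged with any local sensitive overrides."""
+    merged = dict(DEFAULT_CONFIG)
+    merged.update(SENSITIVE_CONFIG)
+    return merged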
diff --git a/config_sensitive_template.py b/config_sensitive_template.py
new file mode 100644
index 00000000..f8adc7e4
--- /dev/null
+++ b/config_sensitive_template.py
@@ -0,0 +1,49 @@
+"""
+JobSpy Sensitive Configuration Template
+=======================================
+
+Setup Instructions:
+1. Copy this file to 'config_sensitive.py'
+2. Fill in your actual values
+3. Keep config_sensitive.py in .gitignore
+
+Security Best Practices:
+- Never commit config_sensitive.py to version control
+- Store proxy credentials securely
+- Rotate credentials regularly
+- Use environment variables when possible
+"""
+
+SENSITIVE_CONFIG = {
+    'proxy_enabled': True,  # Set to False to disable proxy usage
+
+    # Add your proxy URLs here (at least one required if proxy_enabled is True)
+    'proxy_list': [
+        "http://your-username:your-password@your-proxy-host:port",
+        "http://your-backup-proxy-url:port"  # Optional backup proxy
+    ],
+
+    # IP verification services (can be customized)
+    'proxy_verification_urls': [
+        'http://api.ipify.org?format=json',
+        'http://ip-api.com/json',
+        'http://ifconfig.me/ip'
+    ],
+
+    # Advanced Settings
+    'proxy_timeout': 10,     # Seconds to wait for proxy response
+    'max_retries': 3,        # Maximum retry attempts per proxy
+    'rotate_interval': 100,  # Rotate proxy after N requests
+    'verify_ssl': False      # Disable for some proxy configurations
+}
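+
+# Hedged alternative, per "Use environment variables when possible" above:
+# an override like the following (left commented out) would let the proxy
+# URL come from the environment instead of living in this file. The
+# JOBSPY_PROXY_URL name is illustrative and is not read anywhere else.
+#
+#   import os
+#   if os.environ.get("JOBSPY_PROXY_URL"):
+#       SENSITIVE_CONFIG["proxy_list"] = [os.environ["JOBSPY_PROXY_URL"]]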
+
+"""
+Example format for proxy_list entries:
+- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
+- Generic format: "http://username:password@host:port"
+
+Security Notes:
+1. Never commit config_sensitive.py to version control
+2. Keep your proxy credentials secure
+3. Regularly rotate proxy credentials if possible
+"""
\ No newline at end of file
diff --git a/job_search.py b/job_search.py
new file mode 100644
index 00000000..ba632724
--- /dev/null
+++ b/job_search.py
@@ -0,0 +1,109 @@
+from jobspy import scrape_jobs
+import re
+import pandas as pd
+from datetime import datetime
+from typing import Optional, List
+
+def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter out jobs requiring security clearance"""
+    clearance_keywords = [
+        'clearance', 'security clearance', 'secret', 'top secret',
+        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
+        'public trust', 'security+', 'security plus'
+    ]
+
+    # Create a pattern matching any clearance keyword; escape each keyword
+    # so regex metacharacters (the '+' in 'security+') match literally
+    pattern = '|'.join(re.escape(k) for k in clearance_keywords)
+
+    # Filter out jobs where title or description contains clearance keywords
+    mask = ~(
+        df['title'].str.lower().str.contains(pattern, na=False) |
+        df['description'].str.lower().str.contains(pattern, na=False)
+    )
+
+    return df[mask]
+
+def search_tech_jobs(
+    search_sites: List[str] = ["indeed", "linkedin"],
+    exclude_clearance: bool = False
+) -> Optional[pd.DataFrame]:
+
+    # Search configuration
+    search_config = {
+        'search_term': 'IT Engineer',
+        'location': 'Lone Tree, CO',
+        'distance': 25,
+        'results_wanted': 50,
+        'job_type': 'fulltime',
+        'hours_old': 72
+    }
+
+    try:
+        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
+        print(f"Distance: {search_config['distance']} miles")
+        print(f"Job Type: {search_config['job_type']}")
+        print(f"Posts from last: {search_config['hours_old']} hours")
+        print(f"Excluding clearance jobs: {exclude_clearance}")
+        print(f"Searching on: {', '.join(search_sites)}")
+
+        jobs = scrape_jobs(
+            site_name=search_sites,
+            search_term=search_config['search_term'],
+            location=search_config['location'],
+            distance=search_config['distance'],
+            results_wanted=search_config['results_wanted'],
+            job_type=search_config['job_type'],
+            hours_old=search_config['hours_old'],
+            country_indeed="USA",
+            description_format="markdown",
+            verbose=2
+        )
+
+        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
+            print(f"\nInitial jobs found: {len(jobs)}")
+
+            if exclude_clearance:
+                original_count = len(jobs)
+                jobs = filter_clearance_jobs(jobs)
+                filtered_count = len(jobs)
+                print(f"Removed {original_count - filtered_count} jobs requiring clearance")
+
+            # Save results
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            csv_filename = f"it_jobs_{timestamp}.csv"
+
+            # Print job summary (enumerate from 1; the DataFrame index has
+            # gaps after filtering, so it is not used for numbering)
+            print("\nJob Listings Found:")
+            print("-------------------")
+            for i, (_, job) in enumerate(jobs.iterrows(), start=1):
+                print(f"\n{i}. {job.get('title', 'No title')}")
+                print(f"   Company: {job.get('company', 'No company')}")
+                print(f"   Location: {job.get('location', 'No location')}")
+                print(f"   Source: {job.get('site', 'No source')}")
+                print(f"   Date Posted: {job.get('date_posted', 'No date')}")
+
+            jobs.to_csv(csv_filename, index=False)
+            print(f"\nResults saved to: {csv_filename}")
+            return jobs
+
+        print("No jobs found with current search parameters.")
+        return None
+
+    except Exception as e:
+        print("\nError during search:")
+        print(f"Error details: {str(e)}")
+        return None
+
+if __name__ == "__main__":
+    print("Starting job search...")
+    jobs = search_tech_jobs(exclude_clearance=True)
+
+    if jobs is not None and not jobs.empty:
+        print("\nSearch completed successfully!")
+        print(f"Total jobs found: {len(jobs)}")
+        print("\nJobs by source:")
+        print(jobs['site'].value_counts())
+    else:
+        print("\nNo results found. Try adjusting search parameters.")
\ No newline at end of file
diff --git a/job_search_advanced.py b/job_search_advanced.py
new file mode 100644
index 00000000..84949a22
--- /dev/null
+++ b/job_search_advanced.py
@@ -0,0 +1,365 @@
+import csv
+import re
+from jobspy import scrape_jobs
+from datetime import datetime
+import certifi
+import time
+from typing import Optional, Dict, Any
+import pandas as pd
+import requests
+import sys
+from requests import Session
+
+def fix_linkedin_url(url: str) -> str:
+    """Fix incomplete LinkedIn URLs."""
+    if not url or 'linkedin' not in url:
+        return url
+
+    # If URL is truncated, try to reconstruct it
+    if url.startswith('https://www.linkedin') and '/jobs/view/' not in url:
+        # Extract the job ID if present
+        job_id = url.split('/')[-1] if url.split('/')[-1].isdigit() else None
+        if job_id:
+            return f"https://www.linkedin.com/jobs/view/{job_id}"
+    return url
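+
+# Example of the assumed truncation shape: a URL such as
+#   "https://www.linkedin.com/3921776523"
+# (job ID present but path missing) is rewritten by fix_linkedin_url to
+#   "https://www.linkedin.com/jobs/view/3921776523",
+# while complete /jobs/view/ URLs pass through unchanged. The ID shown
+# here is made up for illustration.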
+
+def clean_job_data(jobs_df):
+    """Clean and validate job data."""
+    # Fix LinkedIn URLs
+    jobs_df['job_url'] = jobs_df.apply(
+        lambda row: fix_linkedin_url(row['job_url']) if row['site'] == 'linkedin' else row['job_url'],
+        axis=1
+    )
+
+    # Remove rows with missing essential data
+    essential_columns = ['title', 'company', 'location', 'job_url']
+    jobs_df = jobs_df.dropna(subset=essential_columns)
+
+    # Clean up location data
+    jobs_df['location'] = jobs_df['location'].fillna('Location not specified')
+
+    # Ensure description exists
+    jobs_df['description'] = jobs_df['description'].fillna('No description available')
+
+    return jobs_df
+
+def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter out jobs requiring security clearance"""
+    clearance_keywords = [
+        'clearance', 'security clearance', 'secret', 'top secret',
+        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
+        'public trust', 'security+', 'security plus'
+    ]
+
+    # Create a pattern matching any clearance keyword; escape each keyword
+    # so regex metacharacters (the '+' in 'security+') match literally
+    pattern = '|'.join(re.escape(k) for k in clearance_keywords)
+
+    # Filter out jobs where title or description contains clearance keywords
+    mask = ~(
+        df['title'].str.lower().str.contains(pattern, na=False) |
+        df['description'].str.lower().str.contains(pattern, na=False)
+    )
+
+    return df[mask]
+
+def verify_proxy(proxy: str) -> bool:
+    """Enhanced proxy verification"""
+    try:
+        # Check multiple IP verification services
+        verification_urls = [
+            'http://api.ipify.org?format=json',
+            'http://ip-api.com/json',
+            'http://ifconfig.me/ip'
+        ]
+
+        # First check real IP (only first 3 digits are ever printed)
+        real_ips = []
+        for url in verification_urls:
+            try:
+                response = requests.get(url, timeout=5)
+                if response.ok:
+                    if 'ifconfig' in url:
+                        ip = response.text.strip()
+                    else:
+                        payload = response.json()
+                        # ipify uses the "ip" key; ip-api.com uses "query"
+                        ip = payload.get('ip') or payload.get('query', response.text)
+                    real_ips.append(ip)
+                    break
+            except Exception:
+                continue
+
+        if not real_ips:
+            print("Could not verify real IP")
+            return False
+
+        real_ip = real_ips[0]
+
+        # Check with proxy
+        proxies = {
+            'http': proxy,
+            'https': proxy
+        }
+
+        # Configure session to handle SSL issues
+        session = requests.Session()
+        session.verify = False
+        requests.packages.urllib3.disable_warnings()
+
+        proxy_ips = []
+        for url in verification_urls:
+            try:
+                response = session.get(url, proxies=proxies, timeout=10)
+                if response.ok:
+                    if 'ifconfig' in url:
+                        ip = response.text.strip()
+                    else:
+                        payload = response.json()
+                        ip = payload.get('ip') or payload.get('query', response.text)
+                    proxy_ips.append(ip)
+                    break
+            except Exception:
+                continue
+
+        if not proxy_ips:
+            print("Could not verify proxy IP")
+            return False
+
+        proxy_ip = proxy_ips[0]
+
+        if real_ip != proxy_ip:
+            print("\nProxy verification successful!")
+            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
+            print(f"Proxy IP: {proxy_ip}")
+            print(f"IP Verification Service: {url}")
+            return True
+        else:
+            print("\nWarning: Proxy not working - IPs match!")
+            return False
+
+    except Exception as e:
+        print(f"\nProxy verification failed: {str(e)}")
+        return False
+
+def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
+    """Verify proxy usage and return traffic stats"""
+    response = session.get(url, stream=True)
+    content_size = len(response.content)
+
+    return {
+        "status_code": response.status_code,
+        "content_size": content_size,
+        "headers": dict(response.headers),
+        "proxy_used": bool(session.proxies)
+    }
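+
+# Usage sketch (assumed wiring; nothing in this script calls the helper yet):
+#
+#   session = requests.Session()
+#   session.proxies = {'http': current_proxy, 'https': current_proxy}
+#   stats = verify_proxy_usage(session, 'http://api.ipify.org?format=json')
+#   print(stats['status_code'], stats['content_size'], stats['proxy_used'])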
+
+def search_tech_jobs_with_proxies() -> Optional[pd.DataFrame]:
+    # Comprehensive search configuration
+    search_config = {
+        # Search parameters
+        'search_term': 'IT Engineer',
+        'location': 'Lone Tree, CO',
+        'distance': 25,
+        'results_wanted': 50,
+        'job_type': 'fulltime',
+        'hours_old': 72,
+
+        # Filter settings
+        'exclude_clearance': True,
+        'search_sites': ["indeed", "linkedin"],
+
+        # Proxy settings
+        'use_proxy': True,  # Proxy kill switch
+        'proxy_list': [
+            "http://brd-customer-hl_92b00ed6-zone-residential_proxies_us:5t01plrkfs6y@brd.superproxy.io:33335",
+            "http://brd-customer-hl_92b00ed6-zone-residential_proxy2_us:uyfjctxhc8t4@brd.superproxy.io:33335"
+        ],
+
+        # Clearance keywords to filter
+        'clearance_keywords': [
+            'clearance', 'security clearance', 'secret', 'top secret',
+            'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
+            'public trust', 'security+', 'security plus'
+        ],
+
+        # Additional settings for better results
+        'max_retries_per_proxy': 2,  # Number of retries per proxy
+        'verify_timeout': 15,        # Timeout for proxy verification
+        'date_format': '%Y-%m-%d',   # Standardize date format
+        'strict_location': True,     # Enforce stricter location filtering
+
+        # Location verification
+        'location_center': {
+            'lat': 39.5486,  # Lone Tree coordinates
+            'lon': -104.8719
+        },
+        'max_distance': 25,  # miles
+
+        # Debug settings
+        'show_filtered_jobs': False,  # Option to show filtered out jobs
+        'debug_mode': False,          # Additional debugging information
+        'debug': {
+            'show_traffic': True,
+            'log_requests': True,
+            'show_proxy_usage': True
+        }
+    }
+
+    max_retries = 3
+    retry_count = 0
+
+    # Proxy verification and kill switch
+    if search_config['use_proxy']:
+        print("\nVerifying proxy configuration...")
+        proxy_verified = False
+        for proxy in search_config['proxy_list']:
+            if verify_proxy(proxy):
+                proxy_verified = True
+                break
+
+        if not proxy_verified:
+            print("\nNo working proxies found! Exiting for safety...")
+            sys.exit(1)
+    else:
+        print("\nWARNING: Running without proxy! This may result in IP blocking.")
+        user_input = input("Continue without proxy? (yes/no): ")
+        if user_input.lower() != 'yes':
+            print("Exiting...")
+            sys.exit(0)
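+
+    # Note: proxy selection below is round-robin across retries; with two
+    # proxies and max_retries = 3, attempts use proxy indices 0, 1, 0.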
+    while retry_count < max_retries:
+        current_proxy = search_config['proxy_list'][retry_count % len(search_config['proxy_list'])] if search_config['use_proxy'] else None
+
+        try:
+            print(f"\nAttempt {retry_count + 1} of {max_retries}")
+            if current_proxy:
+                print(f"Using proxy: {current_proxy}")
+            print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
+            print(f"Distance: {search_config['distance']} miles")
+            print(f"Job Type: {search_config['job_type']}")
+            print(f"Posts from last: {search_config['hours_old']} hours")
+            print(f"Excluding clearance jobs: {search_config['exclude_clearance']}")
+            print(f"Searching on: {', '.join(search_config['search_sites'])}")
+
+            jobs = scrape_jobs(
+                site_name=search_config['search_sites'],
+                search_term=search_config['search_term'],
+                location=search_config['location'],
+                distance=search_config['distance'],
+                results_wanted=search_config['results_wanted'],
+                job_type=search_config['job_type'],
+                hours_old=search_config['hours_old'],
+                country_indeed="USA",
+                description_format="markdown",
+                verbose=2,
+                proxy=current_proxy,
+                verify=False if current_proxy else certifi.where(),  # Disable SSL verify when using proxy
+            )
+
+            if not isinstance(jobs, pd.DataFrame):
+                print("Invalid response format from job search.")
+                retry_count += 1
+                continue
+
+            if jobs.empty:
+                print("No jobs found with current search parameters.")
+                retry_count += 1
+                continue
+
+            print(f"\nInitial jobs found: {len(jobs)}")
+
+            # Track filtered jobs
+            filtered_jobs = {
+                'clearance': 0,
+                'location': 0,
+                'date': 0
+            }
+
+            if search_config['exclude_clearance']:
+                original_count = len(jobs)
+                # Escape keywords so 'security+' and friends match literally
+                pattern = '|'.join(re.escape(k) for k in search_config['clearance_keywords'])
+                clearance_mask = ~(
+                    jobs['title'].str.lower().str.contains(pattern, na=False) |
+                    jobs['description'].str.lower().str.contains(pattern, na=False)
+                )
+                filtered_jobs['clearance'] = original_count - len(jobs[clearance_mask])
+                jobs = jobs[clearance_mask]
+
+            # Fix date formatting
+            jobs['date_posted'] = pd.to_datetime(jobs['date_posted'], errors='coerce')
+            date_mask = jobs['date_posted'].notna()
+            filtered_jobs['date'] = len(jobs) - len(jobs[date_mask])
+            jobs = jobs[date_mask]
+
+            # Location filtering
+            if search_config['strict_location']:
+                location_mask = jobs['location'].apply(
+                    lambda x: is_within_radius(x,
+                                               search_config['location_center'],
+                                               search_config['max_distance'])
+                )
+                filtered_jobs['location'] = len(jobs) - len(jobs[location_mask])
+                jobs = jobs[location_mask]
+
+            # Print filtering summary
+            print("\nFiltering Summary:")
+            for reason, count in filtered_jobs.items():
+                if count > 0:
+                    print(f"Removed {count} jobs due to {reason}")
+
+            # Save results
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            csv_filename = f"it_jobs_{timestamp}.csv"
+
+            # Print job summary (enumerate from 1; the filtered DataFrame
+            # index is no longer contiguous)
+            print("\nJob Listings Found:")
+            print("-------------------")
+            for i, (_, job) in enumerate(jobs.iterrows(), start=1):
+                print(f"\n{i}. {job.get('title', 'No title')}")
+                print(f"   Company: {job.get('company', 'No company')}")
+                print(f"   Location: {job.get('location', 'No location')}")
+                print(f"   Source: {job.get('site', 'No source')}")
+                print(f"   Date Posted: {job.get('date_posted', 'No date')}")
+
+            # Save to CSV
+            jobs.to_csv(
+                csv_filename,
+                quoting=csv.QUOTE_NONNUMERIC,
+                escapechar="\\",
+                index=False
+            )
+
+            print(f"\nResults saved to: {csv_filename}")
+            return jobs
+
+        except Exception as e:
+            print(f"\nError with proxy {current_proxy}:")
+            print(f"Error details: {str(e)}")
+            retry_count += 1
+
+            if retry_count < max_retries:
+                wait_time = 5 * retry_count
+                print(f"\nWaiting {wait_time} seconds before trying next proxy...")
+                time.sleep(wait_time)
+            else:
+                print("\nAll attempts failed. Please try again later.")
+
+    return None
+
+def calculate_distance(job_location, search_location):
+    """
+    Placeholder for distance calculation.
+    In a full implementation, this would use geocoding and actual distance calculation.
+    """
+    return "Unknown"  # Would need geocoding API to calculate actual distances
+
+def is_within_radius(job_location: str, center: dict, max_distance: int) -> bool:
+    """Verify if job location is within specified radius"""
+    try:
+        # Add geocoding logic here if needed
+        return True  # Placeholder for now
+    except Exception:
+        return False
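+
+# A minimal haversine sketch for the placeholder above (left commented out;
+# it assumes both points are already geocoded to lat/lon, and geocoding the
+# free-text job_location string would need an external API):
+#
+#   import math
+#   def haversine_miles(lat1, lon1, lat2, lon2):
+#       earth_radius_miles = 3958.8
+#       dlat = math.radians(lat2 - lat1)
+#       dlon = math.radians(lon2 - lon1)
+#       a = (math.sin(dlat / 2) ** 2 +
+#            math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
+#            math.sin(dlon / 2) ** 2)
+#       return 2 * earth_radius_miles * math.asin(math.sqrt(a))
+#
+#   # is_within_radius could then compare, e.g.:
+#   # haversine_miles(job_lat, job_lon, center['lat'], center['lon']) <= max_distance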
+
+if __name__ == "__main__":
+    print("Starting job search...")
+    jobs = search_tech_jobs_with_proxies()
+
+    if jobs is not None and not jobs.empty:
+        print("\nSearch completed successfully!")
+        print(f"Total jobs found: {len(jobs)}")
+        print("\nJobs by source:")
+        print(jobs['site'].value_counts())
+    else:
+        print("\nNo results found. Try adjusting search parameters.")
diff --git a/proxy_utils.py b/proxy_utils.py
new file mode 100644
index 00000000..e6fb050b
--- /dev/null
+++ b/proxy_utils.py
@@ -0,0 +1,88 @@
+from typing import Dict, Any, Optional
+from requests import Session, Response
+import requests
+import urllib3
+
+# Suppress SSL warnings
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+def verify_proxy(proxy: str, verification_urls: list) -> bool:
+    """Verify proxy is working and hiding the real IP"""
+    try:
+        # First check real IP
+        real_ip = get_real_ip(verification_urls)
+        if not real_ip:
+            print("Could not verify real IP")
+            return False
+
+        proxy_ip = get_proxy_ip(proxy, verification_urls)
+        if not proxy_ip:
+            print("Could not verify proxy IP")
+            return False
+
+        if real_ip != proxy_ip:
+            print("\nProxy verification successful!")
+            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
+            print(f"Proxy IP: {proxy_ip}")
+            return True
+        else:
+            print("\nWarning: Proxy not working - IPs match!")
+            return False
+
+    except Exception as e:
+        print(f"\nProxy verification failed: {str(e)}")
+        return False
+
+def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
+    """Verify proxy usage and return traffic stats"""
+    try:
+        response = session.get(url, stream=True)
+        content_size = len(response.content)
+
+        return {
+            "status_code": response.status_code,
+            "content_size": content_size,
+            "headers": dict(response.headers),
+            "proxy_used": bool(session.proxies)
+        }
+    except Exception as e:
+        print(f"Error tracking proxy usage: {str(e)}")
+        return {
+            "status_code": 0,
+            "content_size": 0,
+            "headers": {},
+            "proxy_used": False
+        }
+
+def get_real_ip(verification_urls: list) -> Optional[str]:
+    """Get real IP address without proxy"""
+    for url in verification_urls:
+        try:
+            response = requests.get(url, timeout=5)
+            if response.ok:
+                return extract_ip(response, url)
+        except Exception:
+            continue
+    return None
+
+def get_proxy_ip(proxy: str, verification_urls: list) -> Optional[str]:
+    """Get IP address when using proxy"""
+    proxies = {'http': proxy, 'https': proxy}
+    session = requests.Session()
+    session.verify = False
+
+    for url in verification_urls:
+        try:
+            response = session.get(url, proxies=proxies, timeout=10)
+            if response.ok:
+                return extract_ip(response, url)
+        except Exception:
+            continue
+    return None
+
+def extract_ip(response: Response, url: str) -> str:
+    """Extract IP from response based on service used"""
+    if 'ifconfig.me' in url:
+        return response.text.strip()
+    data = response.json()
+    # ipify returns the address under "ip"; ip-api.com uses "query"
+    return data.get('ip') or data.get('query') or response.text
\ No newline at end of file
diff --git a/setup_config.py b/setup_config.py
new file mode 100644
index 00000000..02315ed2
--- /dev/null
+++ b/setup_config.py
@@ -0,0 +1,46 @@
+"""
+Helper script to set up configuration files
+"""
+import os
+import shutil
+from getpass import getpass
+
+def setup_config():
+    # Check if config_sensitive.py already exists
+    if os.path.exists('config_sensitive.py'):
+        overwrite = input("config_sensitive.py already exists. Overwrite? (yes/no): ")
+        if overwrite.lower() != 'yes':
+            print("Setup cancelled.")
+            return
+
+    # Copy template
+    shutil.copy2('config_sensitive_template.py', 'config_sensitive.py')
+
+    # Get proxy configuration
+    use_proxy = input("Do you want to use proxies? (yes/no): ").lower() == 'yes'
+
+    if use_proxy:
+        proxy_url = input("Enter proxy URL (format: http://host:port): ")
+        username = input("Proxy username: ")
+        password = getpass("Proxy password: ")
+
+        # Create proxy string, e.g. http://user:pass@host:port
+        proxy = f"http://{username}:{password}@{proxy_url.split('//')[1]}"
+
+        # Update config file
+        with open('config_sensitive.py', 'r') as f:
+            content = f.read()
+
+        content = content.replace(
+            '"http://your-username:your-password@your-proxy-host:port"',
+            f'"{proxy}"'
+        )
+
+        with open('config_sensitive.py', 'w') as f:
+            f.write(content)
+
+    print("\nConfiguration file created successfully!")
+    print("Remember to add config_sensitive.py to .gitignore")
+
+if __name__ == "__main__":
+    setup_config()
\ No newline at end of file