Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add complete job search functionality with proxy support and security… #240

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,37 @@
# Python bytecode / caches
__pycache__/
*.py[cod]
*$py.class
.Python
.pytest_cache/
.ipynb_checkpoints/

# Packaging / build artifacts
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
venv/
env/
ENV/

# Sensitive configuration — never commit credentials
config_sensitive.py
.env

# Generated files
*.csv
*.log
output/

# IDE / editor
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
23 changes: 23 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Default configuration that can be committed
DEFAULT_CONFIG = {
    'search_term': 'IT Engineer',
    'location': 'Lone Tree, CO',
    'distance': 25,
    'results_wanted': 50,
    'job_type': 'fulltime',
    'hours_old': 72,
    'search_sites': ["indeed", "linkedin"],
    'exclude_clearance': True,
    'clearance_keywords': [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
}


# Effective configuration: defaults overlaid with any local sensitive
# settings (proxy credentials etc.) from the git-ignored config_sensitive.py.
CONFIG = dict(DEFAULT_CONFIG)

try:
    # Absolute import: this module lives at the repository root next to
    # config_sensitive.py, not inside a package, so a relative import
    # ("from .config_sensitive import ...") would always raise ImportError
    # and the sensitive settings could never load.
    from config_sensitive import SENSITIVE_CONFIG
    # Merge the sensitive values over the defaults so callers can use CONFIG.
    CONFIG.update(SENSITIVE_CONFIG)
except ImportError:
    print("Warning: No sensitive configuration found. Using defaults.")
49 changes: 49 additions & 0 deletions config_sensitive_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
JobSpy Sensitive Configuration Template
=====================================

Setup Instructions:
1. Copy this file to 'config_sensitive.py'
2. Fill in your actual values
3. Keep config_sensitive.py in .gitignore

Security Best Practices:
- Never commit config_sensitive.py to version control
- Store proxy credentials securely
- Rotate credentials regularly
- Use environment variables when possible
"""

SENSITIVE_CONFIG = {
'proxy_enabled': True, # Set to False to disable proxy usage

# Add your proxy URLs here (at least one required if proxy_enabled is True)
'proxy_list': [
"http://your-username:your-password@your-proxy-host:port",
"http://your-backup-proxy-url:port" # Optional backup proxy
],

# IP verification services (can be customized)
'proxy_verification_urls': [
'http://api.ipify.org?format=json',
'http://ip-api.com/json',
'http://ifconfig.me/ip'
],

# Advanced Settings
'proxy_timeout': 10, # Seconds to wait for proxy response
'max_retries': 3, # Maximum retry attempts per proxy
'rotate_interval': 100, # Rotate proxy after N requests
'verify_ssl': False # Disable for some proxy configurations
}

"""
Example format for proxy_list entries:
- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
- Generic format: "http://username:password@host:port"

Security Notes:
1. Never commit config_sensitive.py to version control
2. Keep your proxy credentials secure
3. Regularly rotate proxy credentials if possible
"""
109 changes: 109 additions & 0 deletions job_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import csv
import re
import time
from datetime import datetime
from typing import Optional, List

import certifi
import pandas as pd

from jobspy import scrape_jobs

def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Remove job rows whose title or description mentions a security clearance.

    Args:
        df: Job listings with at least 'title' and 'description' columns.

    Returns:
        A filtered view of ``df`` (original index preserved) containing only
        rows where neither field matches a clearance keyword.
    """
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]

    # str.contains treats the pattern as a regex, and keywords such as
    # 'security+' and 'ts/sci' contain regex metacharacters — escape each
    # keyword so every entry matches literally.
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)

    # Keep rows where NEITHER the title NOR the description matches.
    # na=False: treat missing title/description as "no match".
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )

    return df[mask]

def search_tech_jobs(
    search_sites: Optional[List[str]] = None,
    exclude_clearance: bool = False
) -> Optional[pd.DataFrame]:
    """Scrape recent IT job postings, optionally dropping clearance roles.

    Args:
        search_sites: Job boards to query. Defaults to indeed + linkedin.
            The default is ``None`` (not a list literal) to avoid the
            shared-mutable-default-argument pitfall.
        exclude_clearance: When True, remove postings whose title or
            description mentions a security-clearance keyword.

    Returns:
        The scraped jobs as a DataFrame (also written to a timestamped CSV
        in the working directory), or ``None`` when nothing was found or
        the scrape raised.
    """
    if search_sites is None:
        search_sites = ["indeed", "linkedin"]

    # Search configuration
    search_config = {
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72
    }

    try:
        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
        print(f"Distance: {search_config['distance']} miles")
        print(f"Job Type: {search_config['job_type']}")
        print(f"Posts from last: {search_config['hours_old']} hours")
        print(f"Excluding clearance jobs: {exclude_clearance}")
        print(f"Searching on: {', '.join(search_sites)}")

        jobs = scrape_jobs(
            site_name=search_sites,
            search_term=search_config['search_term'],
            location=search_config['location'],
            distance=search_config['distance'],
            results_wanted=search_config['results_wanted'],
            job_type=search_config['job_type'],
            hours_old=search_config['hours_old'],
            country_indeed="USA",
            description_format="markdown",
            verbose=2
        )

        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
            print(f"\nInitial jobs found: {len(jobs)}")

            if exclude_clearance:
                original_count = len(jobs)
                jobs = filter_clearance_jobs(jobs)
                filtered_count = len(jobs)
                print(f"Removed {original_count - filtered_count} jobs requiring clearance")

            # Save results to a unique, timestamped file so reruns never clobber.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            jobs.to_csv(csv_filename, index=False)
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        print("No jobs found with current search parameters.")
        return None

    except Exception as e:
        # Broad catch is deliberate at this script boundary: report the
        # scraper failure and signal "no results" rather than crash.
        print(f"\nError during search:")
        print(f"Error details: {str(e)}")
        return None

if __name__ == "__main__":
print("Starting job search...")
jobs = search_tech_jobs(exclude_clearance=True)

if jobs is not None and not jobs.empty:
print("\nSearch completed successfully!")
print(f"Total jobs found: {len(jobs)}")
print("\nJobs by source:")
print(jobs['site'].value_counts())
else:
print("\nNo results found. Try adjusting search parameters.")
Loading