Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add complete job search functionality with proxy support and security… #240

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,37 @@
# Python bytecode / caches
__pycache__/
*.py[cod]
*$py.class
.Python
.pytest_cache/
.ipynb_checkpoints/

# Packaging / build artifacts
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
venv/
env/
ENV/

# Sensitive configuration — never commit credentials
config_sensitive.py
.env

# Generated files
*.csv
*.log
output/

# IDE / editor
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
23 changes: 23 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Default configuration that can be committed
DEFAULT_CONFIG = {
    'search_term': 'IT Engineer',
    'location': 'Lone Tree, CO',
    'distance': 25,
    'results_wanted': 50,
    'job_type': 'fulltime',
    'hours_old': 72,
    'search_sites': ["indeed", "linkedin"],
    'exclude_clearance': True,
    'clearance_keywords': [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
}


# Effective configuration: defaults overlaid with any local sensitive
# settings (proxy credentials etc.) from the git-ignored config_sensitive.py.
CONFIG = dict(DEFAULT_CONFIG)

try:
    # Absolute import: this module lives at the repository root next to
    # config_sensitive.py, not inside a package, so a relative import
    # ("from .config_sensitive import ...") would always raise ImportError
    # and the sensitive settings could never load.
    from config_sensitive import SENSITIVE_CONFIG
    # Merge the sensitive values over the defaults so callers can use CONFIG.
    CONFIG.update(SENSITIVE_CONFIG)
except ImportError:
    print("Warning: No sensitive configuration found. Using defaults.")
49 changes: 49 additions & 0 deletions config_sensitive_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
JobSpy Sensitive Configuration Template
=====================================

Setup Instructions:
1. Copy this file to 'config_sensitive.py'
2. Fill in your actual values
3. Keep config_sensitive.py in .gitignore

Security Best Practices:
- Never commit config_sensitive.py to version control
- Store proxy credentials securely
- Rotate credentials regularly
- Use environment variables when possible
"""

SENSITIVE_CONFIG = {
'proxy_enabled': True, # Set to False to disable proxy usage

# Add your proxy URLs here (at least one required if proxy_enabled is True)
'proxy_list': [
"http://your-username:your-password@your-proxy-host:port",
"http://your-backup-proxy-url:port" # Optional backup proxy
],

# IP verification services (can be customized)
'proxy_verification_urls': [
'http://api.ipify.org?format=json',
'http://ip-api.com/json',
'http://ifconfig.me/ip'
],

# Advanced Settings
'proxy_timeout': 10, # Seconds to wait for proxy response
'max_retries': 3, # Maximum retry attempts per proxy
'rotate_interval': 100, # Rotate proxy after N requests
'verify_ssl': False # Disable for some proxy configurations
}

"""
Example format for proxy_list entries:
- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
- Generic format: "http://username:password@host:port"

Security Notes:
1. Never commit config_sensitive.py to version control
2. Keep your proxy credentials secure
3. Regularly rotate proxy credentials if possible
"""
109 changes: 109 additions & 0 deletions job_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import csv
import re
import time
from datetime import datetime
from typing import Optional, List

import certifi
import pandas as pd

from jobspy import scrape_jobs

def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Remove job rows whose title or description mentions a security clearance.

    Args:
        df: Job listings with at least 'title' and 'description' columns.

    Returns:
        A filtered view of ``df`` (original index preserved) containing only
        rows where neither field matches a clearance keyword.
    """
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]

    # str.contains treats the pattern as a regex, and keywords such as
    # 'security+' and 'ts/sci' contain regex metacharacters — escape each
    # keyword so every entry matches literally.
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)

    # Keep rows where NEITHER the title NOR the description matches.
    # na=False: treat missing title/description as "no match".
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )

    return df[mask]

def search_tech_jobs(
    search_sites: Optional[List[str]] = None,
    exclude_clearance: bool = False
) -> Optional[pd.DataFrame]:
    """Scrape recent IT job postings, optionally dropping clearance roles.

    Args:
        search_sites: Job boards to query. Defaults to indeed + linkedin.
            The default is ``None`` (not a list literal) to avoid the
            shared-mutable-default-argument pitfall.
        exclude_clearance: When True, remove postings whose title or
            description mentions a security-clearance keyword.

    Returns:
        The scraped jobs as a DataFrame (also written to a timestamped CSV
        in the working directory), or ``None`` when nothing was found or
        the scrape raised.
    """
    if search_sites is None:
        search_sites = ["indeed", "linkedin"]

    # Search configuration
    search_config = {
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72
    }

    try:
        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
        print(f"Distance: {search_config['distance']} miles")
        print(f"Job Type: {search_config['job_type']}")
        print(f"Posts from last: {search_config['hours_old']} hours")
        print(f"Excluding clearance jobs: {exclude_clearance}")
        print(f"Searching on: {', '.join(search_sites)}")

        jobs = scrape_jobs(
            site_name=search_sites,
            search_term=search_config['search_term'],
            location=search_config['location'],
            distance=search_config['distance'],
            results_wanted=search_config['results_wanted'],
            job_type=search_config['job_type'],
            hours_old=search_config['hours_old'],
            country_indeed="USA",
            description_format="markdown",
            verbose=2
        )

        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
            print(f"\nInitial jobs found: {len(jobs)}")

            if exclude_clearance:
                original_count = len(jobs)
                jobs = filter_clearance_jobs(jobs)
                filtered_count = len(jobs)
                print(f"Removed {original_count - filtered_count} jobs requiring clearance")

            # Save results to a unique, timestamped file so reruns never clobber.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            jobs.to_csv(csv_filename, index=False)
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        print("No jobs found with current search parameters.")
        return None

    except Exception as e:
        # Broad catch is deliberate at this script boundary: report the
        # scraper failure and signal "no results" rather than crash.
        print(f"\nError during search:")
        print(f"Error details: {str(e)}")
        return None

if __name__ == "__main__":
print("Starting job search...")
jobs = search_tech_jobs(exclude_clearance=True)

if jobs is not None and not jobs.empty:
print("\nSearch completed successfully!")
print(f"Total jobs found: {len(jobs)}")
print("\nJobs by source:")
print(jobs['site'].value_counts())
else:
print("\nNo results found. Try adjusting search parameters.")
Loading