job_scraper_utils.py (forked from Eben001/IndeedJobScraper)
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from seleniumbase import Driver


def configure_webdriver():
    # SeleniumBase's Driver() provides a simplified way to create a driver with built-in stealth
    driver = Driver(
        uc=True,        # undetected-chromedriver mode to reduce bot detection
        headless=True,  # run without a visible browser window
        agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    return driver


def search_jobs(driver, country, job_position, job_location, date_posted):
    full_url = (
        f'{country}/jobs?q={"+".join(job_position.split())}'
        f'&l={job_location}&fromage={date_posted}'
    )
    print(full_url)
    driver.get(full_url)
    driver.sleep(0.1)  # brief fixed pause to let the results page settle

    try:
        # Read the job counter shown above the search results
        job_count_element = driver.find_element(
            By.CSS_SELECTOR,
            'div[class*="jobsearch-JobCountAndSortPane-jobCount"] span'
        )
        total_jobs = job_count_element.text
        print(f"{total_jobs} found")
    except NoSuchElementException:
        print("No job count found")
        total_jobs = "Unknown"

    return job_position, total_jobs
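

# Shape of the URL that search_jobs builds (illustrative values, assuming the caller
# passes an Indeed base URL such as "https://www.indeed.com" as `country`):
#   https://www.indeed.com/jobs?q=data+engineer&l=remote&fromage=7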


def scrape_job_data(driver, country, job_position, total_jobs):
    df = pd.DataFrame(columns=['Link', 'Job Title', 'Company', 'Location',
                               'Job Description', 'Salary', 'Search Query'])
    job_count = 0

    while True:
        # Wait for the job cards on the current results page to render
        driver.wait_for_element_present('div.job_seen_beacon')
        soup = BeautifulSoup(driver.get_page_source(), 'lxml')
        boxes = soup.find_all('div', class_='job_seen_beacon')

        for box in boxes:
            link = box.find('a').get('href')
            link_full = country + link
            job_title = box.select_one('h2.jobTitle').text.strip()

            company_tag = box.find('span', {'data-testid': 'company-name'})
            company = company_tag.text if company_tag else None

            location_element = box.find('div', {'data-testid': 'text-location'})
            if location_element:
                location_span = location_element.find('span')
                location = location_span.text if location_span else location_element.text
            else:
                location = ''

            # Open the job details page to pull the full description and salary
            driver.get(link_full)
            driver.wait_for_element_present('#jobDescriptionText')
            soup_job_page = BeautifulSoup(driver.get_page_source(), 'lxml')

            job_description_element = soup_job_page.find('div', id='jobDescriptionText')
            job_description_text = (job_description_element.get_text(strip=True)
                                    if job_description_element else "Unknown")

            salary_element = soup_job_page.find('div', id='salaryInfoAndJobType')
            salary_text = 'Unknown'
            if salary_element:
                spans = salary_element.find_all('span')
                salary_text = (' '.join(span.get_text(strip=True) for span in spans)
                               if spans else salary_element.text.strip())

            new_data = pd.DataFrame({
                'Link': [link_full],
                'Job Title': [job_title],
                'Company': [company],
                'Location': [location],
                'Job Description': [job_description_text],
                'Salary': [salary_text],
                'Search Query': [job_position]
            })
            df = pd.concat([df, new_data], ignore_index=True)
            job_count += 1
            print(f"Scraped {job_count} of {total_jobs}")

        # Pagination: follow the "Next Page" link parsed from the results page
        next_page = soup.find('a', {'aria-label': 'Next Page'})
        if next_page:
            next_page_url = country + next_page.get('href')
            driver.get(next_page_url)
            driver.sleep(0.1)  # brief pause before parsing the next results page
        else:
            break

    return df


def sort_data(df):
    # Return the columns in a fixed, predictable order
    return df[['Link', 'Job Title', 'Company', 'Location',
               'Job Description', 'Salary', 'Search Query']]
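

# A minimal usage sketch, not part of the original module: it assumes an Indeed
# base URL such as "https://www.indeed.com" and an output CSV path chosen by the
# caller; the search parameters below are illustrative, not project defaults.
if __name__ == "__main__":
    driver = configure_webdriver()
    try:
        country = "https://www.indeed.com"   # assumed Indeed base URL
        job_position, total_jobs = search_jobs(
            driver,
            country=country,
            job_position="data engineer",    # illustrative search term
            job_location="remote",           # illustrative location
            date_posted=7,                   # jobs posted within the last 7 days
        )
        df = scrape_job_data(driver, country, job_position, total_jobs)
        df = sort_data(df)
        df.to_csv("indeed_jobs.csv", index=False)   # illustrative output path
    finally:
        driver.quit()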