fix(indeed): return no jobs instead of error
cullenwatson committed Feb 9, 2024
1 parent 2b72381 commit ccd1859
Showing 2 changed files with 28 additions and 20 deletions.
5 changes: 2 additions & 3 deletions src/jobspy/__init__.py
@@ -1,7 +1,6 @@
 import pandas as pd
 from typing import Tuple
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -119,7 +118,7 @@ def worker(site):
             executor.submit(worker, site): site for site in scraper_input.site_type
         }
 
-        for future in concurrent.futures.as_completed(future_to_site):
+        for future in as_completed(future_to_site):
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data
 
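Note: this hunk only tightens the imports and swaps the fully qualified concurrent.futures.as_completed for a direct import. A minimal runnable sketch of the pattern, with an illustrative worker body and site names standing in for the project's real scrapers:

from concurrent.futures import ThreadPoolExecutor, as_completed

def worker(site: str) -> tuple[str, list[str]]:
    # stand-in for the per-site scrape; returns (site, jobs)
    return site, [f"{site}-job-1"]

sites = ["indeed", "linkedin", "zip_recruiter"]
site_to_jobs_dict: dict[str, list[str]] = {}
with ThreadPoolExecutor() as executor:
    future_to_site = {executor.submit(worker, site): site for site in sites}
    # results arrive in completion order, not submission order
    for future in as_completed(future_to_site):
        site_value, scraped_data = future.result()
        site_to_jobs_dict[site_value] = scraped_data
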
43 changes: 26 additions & 17 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -80,13 +80,14 @@ def scrape_page(
             raise IndeedException(str(e))
 
         soup = BeautifulSoup(response.content, "html.parser")
+        job_list = []
+        total_num_jobs = IndeedScraper.total_jobs(soup)
         if "did not match any jobs" in response.text:
-            raise IndeedException("Parsing exception: Search did not match any jobs")
+            return job_list, total_num_jobs
 
         jobs = IndeedScraper.parse_jobs(
             soup
         )  #: can raise exception, handled by main scrape function
-        total_num_jobs = IndeedScraper.total_jobs(soup)
 
         if (
             not jobs.get("metaData", {})
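This hunk is the fix named in the commit title: a search that matches nothing now returns an empty job list plus the total count instead of raising IndeedException. A simplified sketch of the new control flow (HTML parsing elided; in the real method total_num_jobs comes from total_jobs(soup)):

def scrape_page(response_text: str) -> tuple[list, int]:
    # simplified: the real method builds `soup` from the HTTP response
    job_list: list = []
    total_num_jobs = 0  # real code reads this from the page via total_jobs(soup)
    if "did not match any jobs" in response_text:
        # an empty search is a normal outcome, not an error
        return job_list, total_num_jobs
    # ... parse jobs from the page and extend job_list ...
    return job_list, total_num_jobs

The caller treats an empty page like any other page, so a no-result search now flows through the normal aggregation path instead of aborting the whole scrape.
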
@@ -152,26 +153,34 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         :param scraper_input:
         :return: job_response
         """
-        pages_to_process = (
-            math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
-        )
-
         #: get first page to initialize session
         job_list, total_results = self.scrape_page(scraper_input, 0)
+        pages_processed = 1
 
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            futures: list[Future] = [
-                executor.submit(self.scrape_page, scraper_input, page)
-                for page in range(1, pages_to_process + 1)
-            ]
+        while len(self.seen_urls) < scraper_input.results_wanted:
+            pages_to_process = math.ceil((scraper_input.results_wanted - len(self.seen_urls)) / self.jobs_per_page)
+            new_jobs = False
+
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(self.scrape_page, scraper_input, page + pages_processed)
+                    for page in range(pages_to_process)
+                ]
 
-            for future in futures:
-                jobs, _ = future.result()
-
-                job_list += jobs
+                for future in futures:
+                    jobs, _ = future.result()
+                    if jobs:
+                        job_list += jobs
+                        new_jobs = True
+                    if len(self.seen_urls) >= scraper_input.results_wanted:
+                        break
+
+            pages_processed += pages_to_process
+            if not new_jobs:
+                break
 
-        if len(job_list) > scraper_input.results_wanted:
-            job_list = job_list[: scraper_input.results_wanted]
+        if len(self.seen_urls) > scraper_input.results_wanted:
+            job_list = job_list[:scraper_input.results_wanted]
 
         job_response = JobResponse(
             jobs=job_list,
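The rewritten scrape loop keeps requesting pages until enough unique jobs have been seen, and bails out once a pass yields nothing new, so an exhausted search can no longer spin forever or error out. A condensed, runnable sketch of that strategy, assuming a stubbed scrape_page and a local seen set in place of the scraper's self.seen_urls:

import math
from concurrent.futures import ThreadPoolExecutor

JOBS_PER_PAGE = 15  # assumption; stands in for self.jobs_per_page

def scrape_page(page: int) -> list[str]:
    # stub: pretend every page yields a full page of unique job URLs
    return [f"https://example.com/job/{page}-{i}" for i in range(JOBS_PER_PAGE)]

def scrape(results_wanted: int) -> list[str]:
    seen_urls: set[str] = set()  # stands in for self.seen_urls
    job_list: list[str] = []

    def collect(jobs: list[str]) -> bool:
        for url in jobs:
            if url not in seen_urls:
                seen_urls.add(url)
                job_list.append(url)
        return bool(jobs)

    collect(scrape_page(0))  # first page also initializes the session
    pages_processed = 1
    while len(seen_urls) < results_wanted:
        pages_to_process = math.ceil((results_wanted - len(seen_urls)) / JOBS_PER_PAGE)
        with ThreadPoolExecutor(max_workers=10) as executor:
            pages = range(pages_processed, pages_processed + pages_to_process)
            results = list(executor.map(scrape_page, pages))
        new_jobs = any([collect(jobs) for jobs in results])
        pages_processed += pages_to_process
        if not new_jobs:  # the site ran out of results; stop instead of spinning
            break
    return job_list[:results_wanted]

print(len(scrape(40)))  # with the stub above this prints 40

Worked through the stub: page 0 yields 15 jobs, so the loop requests ceil((40 - 15) / 15) = 2 more pages, sees 45 unique URLs, exits, and trims the list to 40.
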
