From a5916edcddf833419898dcdc30180e91fcdad947 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 3 Jan 2024 12:04:32 -0600 Subject: [PATCH 1/2] fix(glassdoor): add retry adapter (#77) --- src/jobspy/scrapers/glassdoor/__init__.py | 8 ++++---- src/jobspy/scrapers/utils.py | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 57fff5f3..706b3e75 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -26,7 +26,7 @@ def __init__(self, proxy: Optional[str] = None): """ Initializes GlassdoorScraper with the Glassdoor job search url """ - site = Site(Site.ZIP_RECRUITER) + site = Site(Site.GLASSDOOR) super().__init__(site, proxy=proxy) self.url = None @@ -49,7 +49,7 @@ def fetch_jobs_page( payload = self.add_payload( scraper_input, location_id, location_type, page_num, cursor ) - session = create_session(self.proxy, is_tls=False) + session = create_session(self.proxy, is_tls=False, has_retry=True) response = session.post( f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload ) @@ -171,7 +171,7 @@ def get_location(self, location: str, is_remote: bool) -> (int, str): if not location or is_remote: return "11047", "STATE" # remote options url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" - session = create_session(self.proxy) + session = create_session(self.proxy, has_retry=True) response = session.get(url) if response.status_code != 200: raise GlassdoorException( @@ -194,7 +194,7 @@ def add_payload( location_type: str, page_num: int, cursor: str | None = None, - ) -> dict[str, str | Any]: + ) -> str: payload = { "operationName": "JobSearchResultsQuery", "variables": { diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 6ca635be..9b38c0ed 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,8 +1,10 @@ import re import numpy as np -import requests import tls_client +import requests +from requests.adapters import HTTPAdapter, Retry + from ..jobs import JobType @@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None: return email_regex.findall(text) -def create_session(proxy: dict | None = None, is_tls: bool = True): +def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False): """ - Creates a tls client session + Creates a requests session with optional tls, proxy, and retry settings. - :return: A session object with or without proxies. + :return: A session object """ if is_tls: session = tls_client.Session( @@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True): session.allow_redirects = True if proxy: session.proxies.update(proxy) + if has_retry: + retries = Retry(total=3, + connect=3, + status=3, + status_forcelist=[500, 502, 503, 504, 429], + backoff_factor=1) + adapter = HTTPAdapter(max_retries=retries) + + session.mount('http://', adapter) + session.mount('https://', adapter) return session From aeb93b99f511ca18e96c88a863c89f56ab57608b Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 3 Jan 2024 12:04:50 -0600 Subject: [PATCH 2/2] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89587e73..9a49bf79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.30" +version = "1.1.31" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy"