From 3c1994216cf5fb9f66802537840f61a70e4781d8 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 3 Jan 2024 12:02:56 -0600 Subject: [PATCH] fix(glassdoor): retry --- src/jobspy/scrapers/glassdoor/__init__.py | 8 ++++---- src/jobspy/scrapers/utils.py | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 57fff5f3..706b3e75 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -26,7 +26,7 @@ def __init__(self, proxy: Optional[str] = None): """ Initializes GlassdoorScraper with the Glassdoor job search url """ - site = Site(Site.ZIP_RECRUITER) + site = Site(Site.GLASSDOOR) super().__init__(site, proxy=proxy) self.url = None @@ -49,7 +49,7 @@ def fetch_jobs_page( payload = self.add_payload( scraper_input, location_id, location_type, page_num, cursor ) - session = create_session(self.proxy, is_tls=False) + session = create_session(self.proxy, is_tls=False, has_retry=True) response = session.post( f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload ) @@ -171,7 +171,7 @@ def get_location(self, location: str, is_remote: bool) -> (int, str): if not location or is_remote: return "11047", "STATE" # remote options url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" - session = create_session(self.proxy) + session = create_session(self.proxy, has_retry=True) response = session.get(url) if response.status_code != 200: raise GlassdoorException( @@ -194,7 +194,7 @@ def add_payload( location_type: str, page_num: int, cursor: str | None = None, - ) -> dict[str, str | Any]: + ) -> str: payload = { "operationName": "JobSearchResultsQuery", "variables": { diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 6ca635be..9b38c0ed 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,8 +1,10 @@ import re import numpy as np -import requests import tls_client +import requests +from requests.adapters import HTTPAdapter, Retry + from ..jobs import JobType @@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None: return email_regex.findall(text) -def create_session(proxy: dict | None = None, is_tls: bool = True): +def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False): """ - Creates a tls client session + Creates a requests session with optional tls, proxy, and retry settings. - :return: A session object with or without proxies. + :return: A session object """ if is_tls: session = tls_client.Session( @@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True): session.allow_redirects = True if proxy: session.proxies.update(proxy) + if has_retry: + retries = Retry(total=3, + connect=3, + status=3, + status_forcelist=[500, 502, 503, 504, 429], + backoff_factor=1) + adapter = HTTPAdapter(max_retries=retries) + + session.mount('http://', adapter) + session.mount('https://', adapter) return session