From c85f2c68a045b88db863f44935d61cd4791127c8 Mon Sep 17 00:00:00 2001 From: Cullen Date: Thu, 29 Feb 2024 21:28:24 -0600 Subject: [PATCH] fix(indeed): return on error --- poetry.lock | 20 +++++++++++++++----- pyproject.toml | 4 ++-- src/jobspy/__init__.py | 2 +- src/jobspy/scrapers/indeed/__init__.py | 14 ++++++++------ 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index d4581f9d..20eb44d3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -1064,6 +1064,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2271,13 +2281,13 @@ test = ["flake8", "isort", "pytest"] [[package]] name = "tls-client" -version = "1.0" +version = "1.0.1" description = "Advanced Python HTTP Client." optional = false python-versions = "*" files = [ - {file = "tls_client-1.0-py3-none-any.whl", hash = "sha256:f1183f5e18cb31914bd62d11b350a33ea0293ea80fb91d69a3072821dece3e66"}, - {file = "tls_client-1.0.tar.gz", hash = "sha256:7f6de48ad4a0ef69b72682c76ce604155971e07b4bfb2148a36276194ae3e7a0"}, + {file = "tls_client-1.0.1-py3-none-any.whl", hash = "sha256:2f8915c0642c2226c9e33120072a2af082812f6310d32f4ea4da322db7d3bb1c"}, + {file = "tls_client-1.0.1.tar.gz", hash = "sha256:dad797f3412bb713606e0765d489f547ffb580c5ffdb74aed47a183ce8505ff5"}, ] [[package]] @@ -2446,4 +2456,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0" +content-hash = "eea3694820df164179cdd8312d382eb5b29d6317c4d34c586e8866c69aaee9e9" diff --git a/pyproject.toml b/pyproject.toml index 8fd7ba76..42dcf964 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.45" +version = "1.1.46" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" @@ -13,12 +13,12 @@ packages = [ [tool.poetry.dependencies] python = "^3.10" requests = "^2.31.0" -tls-client = "*" beautifulsoup4 = "^4.12.2" pandas = "^2.1.0" NUMPY = "1.24.2" pydantic = "^2.3.0" html2text = "^2020.1.16" +tls-client = "^1.0.1" [tool.poetry.group.dev.dependencies] diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 9aa3e9f1..866c662c 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -160,7 +160,7 @@ def worker(site): # Desired column order desired_order = [ - "job_url_hyper" if 'hyperlinks' in locals() or 'hyperlinks' in globals() else "job_url", + "job_url_hyper" if hyperlinks else "job_url", "site", "title", "company", diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 27c3d34b..acff351b 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -82,7 +82,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse: if not new_jobs: break - if len(self.seen_urls) > scraper_input.results_wanted: job_list = job_list[:scraper_input.results_wanted] @@ -124,12 +123,15 @@ def _scrape_page(self, page: int=0) -> list[JobPost]: return job_list jobs = IndeedScraper._parse_jobs(soup) + if not jobs: + return [] if ( not jobs.get("metaData", {}) .get("mosaicProviderJobCardsModel", {}) .get("results") ): - raise IndeedException("No jobs found.") + logger.error("Indeed - No jobs found.") + return [] jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] job_keys = [job['jobkey'] for job in jobs] @@ -302,11 +304,11 @@ def find_mosaic_script() -> Tag | None: jobs = json.loads(m.group(1).strip()) return jobs else: - raise IndeedException("Could not find mosaic provider job cards data") + logger.warning(f'Indeed: Could not find mosaic provider job cards data') + return {} else: - raise IndeedException( - "Could not find any results for the search" - ) + logger.warning(f"Indeed: Could not parse any jobs on the page") + return {} @staticmethod def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool: