From 94d8f555fd770103f50baf4bc76db32f646bcb29 Mon Sep 17 00:00:00 2001
From: VitaminB16 <38860569+VitaminB16@users.noreply.github.com>
Date: Mon, 11 Mar 2024 04:36:27 +0000
Subject: [PATCH] format: Apply Black formatter to the codebase (#127)
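
Add Black 24.2.0 and pre-commit 3.6.2 as dev dependencies, introduce a
.pre-commit-config.yaml with a Black hook (line length 88), and reformat
the scrapers accordingly.

A typical contributor setup (a sketch, assuming a Poetry-managed checkout)
is to install the dev group once and register the hook:

    poetry install
    poetry run pre-commit install          # run Black automatically on each commit
    poetry run pre-commit run --all-files  # one-off pass over the whole tree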
---
.pre-commit-config.yaml | 7 +
poetry.lock | 204 ++++++++++++++++++-
pyproject.toml | 5 +
src/jobspy/__init__.py | 28 +--
src/jobspy/jobs/__init__.py | 9 +-
src/jobspy/scrapers/__init__.py | 4 +-
src/jobspy/scrapers/glassdoor/__init__.py | 105 ++++++----
src/jobspy/scrapers/indeed/__init__.py | 187 ++++++++++-------
src/jobspy/scrapers/linkedin/__init__.py | 67 +++---
src/jobspy/scrapers/utils.py | 45 ++--
src/jobspy/scrapers/ziprecruiter/__init__.py | 71 +++----
11 files changed, 524 insertions(+), 208 deletions(-)
create mode 100644 .pre-commit-config.yaml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..57074a0d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+- repo: https://github.com/psf/black
+ rev: 24.2.0
+ hooks:
+ - id: black
+ language_version: python
+ args: [--line-length=88, --quiet]
diff --git a/poetry.lock b/poetry.lock
index c129a67a..0e5f789d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -203,6 +203,52 @@ soupsieve = ">1.2"
html5lib = ["html5lib"]
lxml = ["lxml"]
+[[package]]
+name = "black"
+version = "24.2.0"
+description = "The uncompromising code formatter."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "black-24.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6981eae48b3b33399c8757036c7f5d48a535b962a7c2310d19361edeef64ce29"},
+ {file = "black-24.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d533d5e3259720fdbc1b37444491b024003e012c5173f7d06825a77508085430"},
+ {file = "black-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61a0391772490ddfb8a693c067df1ef5227257e72b0e4108482b8d41b5aee13f"},
+ {file = "black-24.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:992e451b04667116680cb88f63449267c13e1ad134f30087dec8527242e9862a"},
+ {file = "black-24.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:163baf4ef40e6897a2a9b83890e59141cc8c2a98f2dda5080dc15c00ee1e62cd"},
+ {file = "black-24.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e37c99f89929af50ffaf912454b3e3b47fd64109659026b678c091a4cd450fb2"},
+ {file = "black-24.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9de21bafcba9683853f6c96c2d515e364aee631b178eaa5145fc1c61a3cc92"},
+ {file = "black-24.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:9db528bccb9e8e20c08e716b3b09c6bdd64da0dd129b11e160bf082d4642ac23"},
+ {file = "black-24.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d84f29eb3ee44859052073b7636533ec995bd0f64e2fb43aeceefc70090e752b"},
+ {file = "black-24.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e08fb9a15c914b81dd734ddd7fb10513016e5ce7e6704bdd5e1251ceee51ac9"},
+ {file = "black-24.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:810d445ae6069ce64030c78ff6127cd9cd178a9ac3361435708b907d8a04c693"},
+ {file = "black-24.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ba15742a13de85e9b8f3239c8f807723991fbfae24bad92d34a2b12e81904982"},
+ {file = "black-24.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e53a8c630f71db01b28cd9602a1ada68c937cbf2c333e6ed041390d6968faf4"},
+ {file = "black-24.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:93601c2deb321b4bad8f95df408e3fb3943d85012dddb6121336b8e24a0d1218"},
+ {file = "black-24.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0057f800de6acc4407fe75bb147b0c2b5cbb7c3ed110d3e5999cd01184d53b0"},
+ {file = "black-24.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:faf2ee02e6612577ba0181f4347bcbcf591eb122f7841ae5ba233d12c39dcb4d"},
+ {file = "black-24.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:057c3dc602eaa6fdc451069bd027a1b2635028b575a6c3acfd63193ced20d9c8"},
+ {file = "black-24.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:08654d0797e65f2423f850fc8e16a0ce50925f9337fb4a4a176a7aa4026e63f8"},
+ {file = "black-24.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca610d29415ee1a30a3f30fab7a8f4144e9d34c89a235d81292a1edb2b55f540"},
+ {file = "black-24.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:4dd76e9468d5536abd40ffbc7a247f83b2324f0c050556d9c371c2b9a9a95e31"},
+ {file = "black-24.2.0-py3-none-any.whl", hash = "sha256:e8a6ae970537e67830776488bca52000eaa37fa63b9988e8c487458d9cd5ace6"},
+ {file = "black-24.2.0.tar.gz", hash = "sha256:bce4f25c27c3435e4dace4815bcb2008b87e167e3bf4ee47ccdc5ce906eb4894"},
+]
+
+[package.dependencies]
+click = ">=8.0.0"
+mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
+pathspec = ">=0.9.0"
+platformdirs = ">=2"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+colorama = ["colorama (>=0.4.3)"]
+d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"]
+jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
+uvloop = ["uvloop (>=0.15.2)"]
+
[[package]]
name = "bleach"
version = "6.0.0"
@@ -308,6 +354,17 @@ files = [
[package.dependencies]
pycparser = "*"
+[[package]]
+name = "cfgv"
+version = "3.4.0"
+description = "Validate configuration and produce human readable error messages."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
+ {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
+]
+
[[package]]
name = "charset-normalizer"
version = "3.2.0"
@@ -392,6 +449,20 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
]
+[[package]]
+name = "click"
+version = "8.1.7"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
+ {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
[[package]]
name = "colorama"
version = "0.4.6"
@@ -471,6 +542,17 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
+[[package]]
+name = "distlib"
+version = "0.3.8"
+description = "Distribution utilities"
+optional = false
+python-versions = "*"
+files = [
+ {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
+ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
+]
+
[[package]]
name = "exceptiongroup"
version = "1.1.3"
@@ -513,6 +595,22 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
+[[package]]
+name = "filelock"
+version = "3.13.1"
+description = "A platform independent file lock."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
+ {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+typing = ["typing-extensions (>=4.8)"]
+
[[package]]
name = "fqdn"
version = "1.5.1"
@@ -524,6 +622,20 @@ files = [
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
]
+[[package]]
+name = "identify"
+version = "2.5.35"
+description = "File identification library for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "identify-2.5.35-py2.py3-none-any.whl", hash = "sha256:c4de0081837b211594f8e877a6b4fad7ca32bbfc1a9307fdd61c28bfe923f13e"},
+ {file = "identify-2.5.35.tar.gz", hash = "sha256:10a7ca245cfcd756a554a7288159f72ff105ad233c7c4b9c6f0f4d108f5f6791"},
+]
+
+[package.extras]
+license = ["ukkonen"]
+
[[package]]
name = "idna"
version = "3.4"
@@ -1125,6 +1237,17 @@ files = [
{file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"},
]
+[[package]]
+name = "mypy-extensions"
+version = "1.0.0"
+description = "Type system extensions for programs checked with the mypy type checker."
+optional = false
+python-versions = ">=3.5"
+files = [
+ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
+ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
+]
+
[[package]]
name = "nbclient"
version = "0.8.0"
@@ -1216,6 +1339,20 @@ files = [
{file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"},
]
+[[package]]
+name = "nodeenv"
+version = "1.8.0"
+description = "Node.js virtual environment builder"
+optional = false
+python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
+files = [
+ {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"},
+ {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"},
+]
+
+[package.dependencies]
+setuptools = "*"
+
[[package]]
name = "notebook"
version = "7.0.3"
@@ -1402,6 +1539,17 @@ files = [
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["docopt", "pytest (<6.0.0)"]
+[[package]]
+name = "pathspec"
+version = "0.12.1"
+description = "Utility library for gitignore style pattern matching of file paths."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
+ {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
+]
+
[[package]]
name = "pexpect"
version = "4.8.0"
@@ -1457,6 +1605,24 @@ files = [
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
+[[package]]
+name = "pre-commit"
+version = "3.6.2"
+description = "A framework for managing and maintaining multi-language pre-commit hooks."
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "pre_commit-3.6.2-py2.py3-none-any.whl", hash = "sha256:ba637c2d7a670c10daedc059f5c49b5bd0aadbccfcd7ec15592cf9665117532c"},
+ {file = "pre_commit-3.6.2.tar.gz", hash = "sha256:c3ef34f463045c88658c5b99f38c1e297abdcc0ff13f98d3370055fbbfabc67e"},
+]
+
+[package.dependencies]
+cfgv = ">=2.0.0"
+identify = ">=1.0.0"
+nodeenv = ">=0.11.1"
+pyyaml = ">=5.1"
+virtualenv = ">=20.10.0"
+
[[package]]
name = "prometheus-client"
version = "0.17.1"
@@ -2183,6 +2349,22 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"]
objc = ["pyobjc-framework-Cocoa"]
win32 = ["pywin32"]
+[[package]]
+name = "setuptools"
+version = "69.1.1"
+description = "Easily download, build, install, upgrade, and uninstall Python packages"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
+ {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
+]
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+
[[package]]
name = "six"
version = "1.16.0"
@@ -2383,6 +2565,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
zstd = ["zstandard (>=0.18.0)"]
+[[package]]
+name = "virtualenv"
+version = "20.25.1"
+description = "Virtual Python Environment builder"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "virtualenv-20.25.1-py3-none-any.whl", hash = "sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a"},
+ {file = "virtualenv-20.25.1.tar.gz", hash = "sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197"},
+]
+
+[package.dependencies]
+distlib = ">=0.3.7,<1"
+filelock = ">=3.12.2,<4"
+platformdirs = ">=3.9.1,<5"
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
+test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
+
[[package]]
name = "wcwidth"
version = "0.2.6"
@@ -2450,4 +2652,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "ba7f7cc9b6833a4a6271981f90610395639dd8b9b3db1370cbd1149d70cc9632"
+content-hash = "6ee18819a726314f61f20f0ed93b2db2a26c232269f045146d9a8f4e3f31eb01"
diff --git a/pyproject.toml b/pyproject.toml
index a85896eb..c71fe655 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,12 @@ markdownify = "^0.11.6"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
+black = "^24.2.0"
+pre-commit = "^3.6.2"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
+
+[tool.black]
+line-length = 88
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index db280985..07b5b274 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -70,6 +72,7 @@ def get_site_type():
for site in site_name
]
return site_types
+
country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput(
@@ -86,14 +89,15 @@ def get_site_type():
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
- hours_old=hours_old
+ hours_old=hours_old,
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraped_data: JobResponse = scraper.scrape(scraper_input)
- site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
+ cap_name = site.value.capitalize()
+ site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
logger.info(f"{site_name} finished scraping")
return site.value, scraped_data
@@ -117,9 +121,8 @@ def worker(site):
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
- job_data[
- "job_url_hyper"
- ] = f'{job_data["job_url"]}'
+ job_url = job_data["job_url"]
+ job_data["job_url_hyper"] = f'{job_url}'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
@@ -156,11 +159,11 @@ def worker(site):
if jobs_dfs:
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
- filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
-
+ filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
+
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
-
+
# Desired column order
desired_order = [
"site",
@@ -178,7 +181,6 @@ def worker(site):
"is_remote",
"emails",
"description",
-
"company_url",
"company_url_direct",
"company_addresses",
@@ -191,16 +193,16 @@ def worker(site):
"ceo_name",
"ceo_photo_url",
]
-
+
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
jobs_df[column] = None # Add missing columns as empty
-
+
# Reorder the DataFrame according to the desired order
jobs_df = jobs_df[desired_order]
-
+
# Step 4: Sort the DataFrame as required
- return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
+ return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
else:
return pd.DataFrame()
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 7ccab45a..31cbce92 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from typing import Optional
from datetime import date
from enum import Enum
@@ -156,7 +158,7 @@ def from_string(cls, country_str: str):
"""Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower()
for country in cls:
- country_names = country.value[0].split(',')
+ country_names = country.value[0].split(",")
if country_str in country_names:
return country
valid_countries = [country.value for country in cls]
@@ -178,7 +180,10 @@ def display_location(self) -> str:
location_parts.append(self.state)
if isinstance(self.country, str):
location_parts.append(self.country)
- elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
+ elif self.country and self.country not in (
+ Country.US_CANADA,
+ Country.WORLDWIDE,
+ ):
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 0c142522..d0233c70 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
- DescriptionFormat
+ DescriptionFormat,
)
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 0483c3f5..0d85aa62 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -4,21 +4,23 @@
This module contains routines to scrape Glassdoor.
"""
-import json
-import re
+from __future__ import annotations
+
+import re
+import json
import requests
-from typing import Optional
+from typing import Optional, Tuple
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
-from ..utils import extract_emails_from_text
from .. import Scraper, ScraperInput, Site
+from ..utils import extract_emails_from_text
from ..exceptions import GlassdoorException
from ..utils import (
create_session,
markdown_converter,
- logger
+ logger,
)
from ...jobs import (
JobPost,
@@ -27,7 +29,7 @@
Location,
JobResponse,
JobType,
- DescriptionFormat
+ DescriptionFormat,
)
@@ -59,25 +61,22 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
token = self._get_csrf_token()
- self.headers['gd-csrf-token'] = token if token else self.fallback_token
+ self.headers["gd-csrf-token"] = token if token else self.fallback_token
location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
- logger.error('Glassdoor: location not parsed')
+ logger.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
all_jobs: list[JobPost] = []
cursor = None
- for page in range(
- 1 + (scraper_input.offset // self.jobs_per_page),
- min(
- (scraper_input.results_wanted // self.jobs_per_page) + 2,
- self.max_pages + 1,
- ),
- ):
- logger.info(f'Glassdoor search page: {page}')
+ range_start = 1 + (scraper_input.offset // self.jobs_per_page)
+ tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
+ range_end = min(tot_pages, self.max_pages + 1)
+ for page in range(range_start, range_end):
+ logger.info(f"Glassdoor search page: {page}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
@@ -87,7 +86,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
- logger.error(f'Glassdoor: {str(e)}')
+ logger.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=all_jobs)
@@ -98,39 +97,48 @@ def _fetch_jobs_page(
location_type: str,
page_num: int,
cursor: str | None,
- ) -> (list[JobPost], str | None):
+ ) -> Tuple[list[JobPost], str | None]:
"""
Scrapes a page of Glassdoor for jobs with scraper_input criteria
"""
jobs = []
self.scraper_input = scraper_input
try:
- payload = self._add_payload(
- location_id, location_type, page_num, cursor
- )
+ payload = self._add_payload(location_id, location_type, page_num, cursor)
response = self.session.post(
- f"{self.base_url}/graph", headers=self.headers, timeout_seconds=15, data=payload
+ f"{self.base_url}/graph",
+ headers=self.headers,
+ timeout_seconds=15,
+ data=payload,
)
if response.status_code != 200:
- raise GlassdoorException(f"bad response status code: {response.status_code}")
+ exc_msg = f"bad response status code: {response.status_code}"
+ raise GlassdoorException(exc_msg)
res_json = response.json()[0]
if "errors" in res_json:
raise ValueError("Error encountered in API response")
- except (requests.exceptions.ReadTimeout, GlassdoorException, ValueError, Exception) as e:
- logger.error(f'Glassdoor: {str(e)}')
+ except (
+ requests.exceptions.ReadTimeout,
+ GlassdoorException,
+ ValueError,
+ Exception,
+ ) as e:
+ logger.error(f"Glassdoor: {str(e)}")
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"]
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
- future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
+ future_to_job_data = {
+ executor.submit(self._process_job, job): job for job in jobs_data
+ }
for future in as_completed(future_to_job_data):
try:
job_post = future.result()
if job_post:
jobs.append(job_post)
except Exception as exc:
- raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
+ raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
return jobs, self.get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
@@ -140,7 +148,9 @@ def _get_csrf_token(self):
"""
Fetches csrf token needed for API by visiting a generic page
"""
- res = self.session.get(f'{self.base_url}/Job/computer-science-jobs.htm', headers=self.headers)
+ res = self.session.get(
+ f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
+ )
pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text)
token = None
@@ -153,19 +163,20 @@ def _process_job(self, job_data):
Processes a single job and fetches its description.
"""
job_id = job_data["jobview"]["job"]["listingId"]
- job_url = f'{self.base_url}job-listing/j?jl={job_id}'
+ job_url = f"{self.base_url}job-listing/j?jl={job_id}"
if job_url in self.seen_urls:
return None
self.seen_urls.add(job_url)
job = job_data["jobview"]
title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"]
- company_id = job_data['jobview']['header']['employer']['id']
+ company_id = job_data["jobview"]["header"]["employer"]["id"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
- date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
+ delta = timedelta(days=age_in_days) if age_in_days is not None else None
+ date_posted = (datetime.now() - delta).date() if delta is not None else None
if location_type == "S":
is_remote = True
@@ -177,9 +188,10 @@ def _process_job(self, job_data):
description = self._fetch_job_description(job_id)
except:
description = None
+ company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
return JobPost(
title=title,
- company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
+ company_url=company_url if company_id else None,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
@@ -201,7 +213,7 @@ def _fetch_job_description(self, job_id):
"variables": {
"jl": job_id,
"queryString": "q",
- "pageTypeEnum": "SERP"
+ "pageTypeEnum": "SERP",
},
"query": """
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
@@ -216,15 +228,17 @@ def _fetch_job_description(self, job_id):
__typename
}
}
- """
+ """,
}
]
res = requests.post(url, json=body, headers=self.headers)
if res.status_code != 200:
return None
data = res.json()[0]
- desc = data['data']['jobview']['job']['description']
- return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
+ desc = data["data"]["jobview"]["job"]["description"]
+ if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+ desc = markdown_converter(desc)
+ return desc
def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
@@ -234,10 +248,13 @@ def _get_location(self, location: str, is_remote: bool) -> (int, str):
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:
- logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
+ err = f"429 Response - Blocked by Glassdoor for too many requests"
+ logger.error(err)
return None, None
else:
- logger.error(f'Glassdoor response status code {res.status_code}')
+ err = f"Glassdoor response status code {res.status_code}"
+ err += f" - {res.text}"
+ logger.error(f"Glassdoor response status code {res.status_code}")
return None, None
items = res.json()
@@ -248,7 +265,7 @@ def _get_location(self, location: str, is_remote: bool) -> (int, str):
location_type = "CITY"
elif location_type == "S":
location_type = "STATE"
- elif location_type == 'N':
+ elif location_type == "N":
location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type
@@ -259,7 +276,9 @@ def _add_payload(
page_num: int,
cursor: str | None = None,
) -> str:
- fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
+ fromage = None
+ if self.scraper_input.hours_old:
+ fromage = max(self.scraper_input.hours_old // 24, 1)
filter_params = []
if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"})
@@ -278,9 +297,9 @@ def _add_payload(
"pageNumber": page_num,
"pageCursor": cursor,
"fromage": fromage,
- "sort": "date"
+ "sort": "date",
},
- "query": self.query_template
+ "query": self.query_template,
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append(
@@ -512,4 +531,4 @@ def get_cursor_for_page(pagination_cursors, page_num):
}
__typename
}
- """
\ No newline at end of file
+ """
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 4211f4fc..3ff6bae0 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -4,9 +4,13 @@
This module contains routines to scrape Indeed.
"""
+
+from __future__ import annotations
+
import math
-from concurrent.futures import ThreadPoolExecutor, Future
+from typing import Tuple
from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, Future
import requests
@@ -15,7 +19,7 @@
extract_emails_from_text,
get_enum_from_job_type,
markdown_converter,
- logger
+ logger,
)
from ...jobs import (
JobPost,
@@ -24,7 +28,7 @@
Location,
JobResponse,
JobType,
- DescriptionFormat
+ DescriptionFormat,
)
@@ -54,30 +58,30 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com"
self.headers = self.api_headers.copy()
- self.headers['indeed-co'] = self.scraper_input.country.indeed_domain_value
+ self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
job_list = []
page = 1
cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages):
- logger.info(f'Indeed skipping search page: {page}')
+ logger.info(f"Indeed skipping search page: {page}")
__, cursor = self._scrape_page(cursor)
if not __:
- logger.info(f'Indeed found no jobs on page: {page}')
+ logger.info(f"Indeed found no jobs on page: {page}")
break
while len(self.seen_urls) < scraper_input.results_wanted:
- logger.info(f'Indeed search page: {page}')
+ logger.info(f"Indeed search page: {page}")
jobs, cursor = self._scrape_page(cursor)
if not jobs:
- logger.info(f'Indeed found no jobs on page: {page}')
+ logger.info(f"Indeed found no jobs on page: {page}")
break
job_list += jobs
page += 1
- return JobResponse(jobs=job_list[:scraper_input.results_wanted])
+ return JobResponse(jobs=job_list[: scraper_input.results_wanted])
- def _scrape_page(self, cursor: str | None) -> (list[JobPost], str | None):
+ def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param cursor:
@@ -86,31 +90,43 @@ def _scrape_page(self, cursor: str | None) -> (list[JobPost], str | None):
jobs = []
new_cursor = None
filters = self._build_filters()
+ location = (
+ self.scraper_input.location
+ or self.scraper_input.country.value[0].split(",")[-1]
+ )
query = self.job_search_query.format(
what=self.scraper_input.search_term,
- location=self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
+ location=location,
radius=self.scraper_input.distance,
dateOnIndeed=self.scraper_input.hours_old,
- cursor=f'cursor: "{cursor}"' if cursor else '',
- filters=filters
+ cursor=f'cursor: "{cursor}"' if cursor else "",
+ filters=filters,
)
payload = {
- 'query': query,
+ "query": query,
}
api_headers = self.api_headers.copy()
- api_headers['indeed-co'] = self.api_country_code
- response = requests.post(self.api_url, headers=api_headers, json=payload, proxies=self.proxy, timeout=10)
+ api_headers["indeed-co"] = self.api_country_code
+ response = requests.post(
+ self.api_url,
+ headers=api_headers,
+ json=payload,
+ proxies=self.proxy,
+ timeout=10,
+ )
if response.status_code != 200:
- logger.info(f'Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)')
+ logger.info(
+ f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
+ )
return jobs, new_cursor
data = response.json()
- jobs = data['data']['jobSearch']['results']
- new_cursor = data['data']['jobSearch']['pageInfo']['nextCursor']
+ jobs = data["data"]["jobSearch"]["results"]
+ new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [
- executor.submit(self._process_job, job['job']) for job in jobs
- ]
+ executor.submit(self._process_job, job["job"]) for job in jobs
+ ]
job_list = [result.result() for result in job_results if result.result()]
return job_list, new_cursor
@@ -128,7 +144,9 @@ def _build_filters(self):
start: "{start}h"
}}
}}
- """.format(start=self.scraper_input.hours_old)
+ """.format(
+ start=self.scraper_input.hours_old
+ )
elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP",
@@ -171,22 +189,24 @@ def _process_job(self, job: dict) -> JobPost | None:
if job_url in self.seen_urls:
return
self.seen_urls.add(job_url)
- description = job['description']['html']
- description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
+ description = job["description"]["html"]
+ if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+ description = markdown_converter(description)
- job_type = self._get_job_type(job['attributes'])
+ job_type = self._get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
- employer = job['employer'].get('dossier') if job['employer'] else None
- employer_details = employer.get('employerDetails', {}) if employer else {}
+ employer = job["employer"].get("dossier") if job["employer"] else None
+ employer_details = employer.get("employerDetails", {}) if employer else {}
+ rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
return JobPost(
title=job["title"],
description=description,
- company_name=job['employer'].get("name") if job.get('employer') else None,
- company_url=f"{self.base_url}{job['employer']['relativeCompanyPageUrl']}" if job[
- 'employer'] else None,
- company_url_direct=employer['links']['corporateWebsite'] if employer else None,
-
+ company_name=job["employer"].get("name") if job.get("employer") else None,
+ company_url=(f"{self.base_url}{rel_url}" if job["employer"] else None),
+ company_url_direct=(
+ employer["links"]["corporateWebsite"] if employer else None
+ ),
location=Location(
city=job.get("location", {}).get("city"),
state=job.get("location", {}).get("admin1Code"),
@@ -196,20 +216,39 @@ def _process_job(self, job: dict) -> JobPost | None:
compensation=self._get_compensation(job),
date_posted=date_posted,
job_url=job_url,
- job_url_direct=job['recruit'].get('viewJobUrl') if job.get('recruit') else None,
+ job_url_direct=(
+ job["recruit"].get("viewJobUrl") if job.get("recruit") else None
+ ),
emails=extract_emails_from_text(description) if description else None,
is_remote=self._is_job_remote(job, description),
-
- company_addresses=employer_details['addresses'][0] if employer_details.get('addresses') else None,
- company_industry=employer_details['industry'].replace('Iv1', '').replace('_', ' ').title() if employer_details.get('industry') else None,
- company_num_employees=employer_details.get('employeesLocalizedLabel'),
- company_revenue=employer_details.get('revenueLocalizedLabel'),
- company_description=employer_details.get('briefDescription'),
- ceo_name=employer_details.get('ceoName'),
- ceo_photo_url=employer_details.get('ceoPhotoUrl'),
-
- logo_photo_url=employer['images'].get('squareLogoUrl') if employer and employer.get('images') else None,
- banner_photo_url=employer['images'].get('headerImageUrl') if employer and employer.get('images') else None,
+ company_addresses=(
+ employer_details["addresses"][0]
+ if employer_details.get("addresses")
+ else None
+ ),
+ company_industry=(
+ employer_details["industry"]
+ .replace("Iv1", "")
+ .replace("_", " ")
+ .title()
+ if employer_details.get("industry")
+ else None
+ ),
+ company_num_employees=employer_details.get("employeesLocalizedLabel"),
+ company_revenue=employer_details.get("revenueLocalizedLabel"),
+ company_description=employer_details.get("briefDescription"),
+ ceo_name=employer_details.get("ceoName"),
+ ceo_photo_url=employer_details.get("ceoPhotoUrl"),
+ logo_photo_url=(
+ employer["images"].get("squareLogoUrl")
+ if employer and employer.get("images")
+ else None
+ ),
+ banner_photo_url=(
+ employer["images"].get("headerImageUrl")
+ if employer and employer.get("images")
+ else None
+ ),
)
@staticmethod
@@ -221,7 +260,7 @@ def _get_job_type(attributes: list) -> list[JobType]:
"""
job_types: list[JobType] = []
for attribute in attributes:
- job_type_str = attribute['label'].replace("-", "").replace(" ", "").lower()
+ job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
@@ -235,33 +274,41 @@ def _get_compensation(job: dict) -> Compensation | None:
:param job:
:return: compensation object
"""
- comp = job['compensation']['baseSalary']
- if comp:
- interval = IndeedScraper._get_compensation_interval(comp['unitOfWork'])
- if interval:
- return Compensation(
- interval=interval,
- min_amount=round(comp['range'].get('min'), 2) if comp['range'].get('min') is not None else None,
- max_amount=round(comp['range'].get('max'), 2) if comp['range'].get('max') is not None else None,
- currency=job['compensation']['currencyCode']
- )
+ comp = job["compensation"]["baseSalary"]
+ if not comp:
+ return None
+ interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
+ if not interval:
+ return None
+ min_range = comp["range"].get("min")
+ max_range = comp["range"].get("max")
+ return Compensation(
+ interval=interval,
+ min_amount=round(min_range, 2) if min_range is not None else None,
+ max_amount=round(max_range, 2) if max_range is not None else None,
+ currency=job["compensation"]["currencyCode"],
+ )
@staticmethod
def _is_job_remote(job: dict, description: str) -> bool:
"""
Searches the description, location, and attributes to check if job is remote
"""
- remote_keywords = ['remote', 'work from home', 'wfh']
+ remote_keywords = ["remote", "work from home", "wfh"]
is_remote_in_attributes = any(
- any(keyword in attr['label'].lower() for keyword in remote_keywords)
- for attr in job['attributes']
+ any(keyword in attr["label"].lower() for keyword in remote_keywords)
+ for attr in job["attributes"]
+ )
+ is_remote_in_description = any(
+ keyword in description.lower() for keyword in remote_keywords
)
- is_remote_in_description = any(keyword in description.lower() for keyword in remote_keywords)
is_remote_in_location = any(
- keyword in job['location']['formatted']['long'].lower()
+ keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords
)
- return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+ return (
+ is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+ )
@staticmethod
def _get_compensation_interval(interval: str) -> CompensationInterval:
@@ -270,7 +317,7 @@ def _get_compensation_interval(interval: str) -> CompensationInterval:
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
- "MONTH": "MONTHLY"
+ "MONTH": "MONTHLY",
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
@@ -279,14 +326,14 @@ def _get_compensation_interval(interval: str) -> CompensationInterval:
raise ValueError(f"Unsupported interval: {interval}")
api_headers = {
- 'Host': 'apis.indeed.com',
- 'content-type': 'application/json',
- 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
- 'accept': 'application/json',
- 'indeed-locale': 'en-US',
- 'accept-language': 'en-US,en;q=0.9',
- 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1',
- 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
+ "Host": "apis.indeed.com",
+ "content-type": "application/json",
+ "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
+ "accept": "application/json",
+ "indeed-locale": "en-US",
+ "accept-language": "en-US,en;q=0.9",
+ "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
+ "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
}
job_search_query = """
query GetJobData {{
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index bb1fbcab..f3975383 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -4,6 +4,9 @@
This module contains routines to scrape LinkedIn.
"""
+
+from __future__ import annotations
+
import time
import random
from typing import Optional
@@ -24,14 +27,14 @@
JobType,
Country,
Compensation,
- DescriptionFormat
+ DescriptionFormat,
)
from ..utils import (
logger,
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
- markdown_converter
+ markdown_converter,
)
@@ -61,26 +64,32 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
seconds_old = (
- scraper_input.hours_old * 3600
- if scraper_input.hours_old
- else None
+ scraper_input.hours_old * 3600 if scraper_input.hours_old else None
+ )
+ continue_search = (
+ lambda: len(job_list) < scraper_input.results_wanted and page < 1000
)
- continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search():
- logger.info(f'LinkedIn search page: {page // 25 + 1}')
+ logger.info(f"LinkedIn search page: {page // 25 + 1}")
session = create_session(is_tls=False, has_retry=True, delay=5)
params = {
"keywords": scraper_input.search_term,
"location": scraper_input.location,
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
- "f_JT": self.job_type_code(scraper_input.job_type)
- if scraper_input.job_type
- else None,
+ "f_JT": (
+ self.job_type_code(scraper_input.job_type)
+ if scraper_input.job_type
+ else None
+ ),
"pageNum": 0,
"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
- "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
+ "f_C": (
+ ",".join(map(str, scraper_input.linkedin_company_ids))
+ if scraper_input.linkedin_company_ids
+ else None
+ ),
}
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
@@ -97,15 +106,19 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
- logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
+ err = (
+ f"429 Response - Blocked by LinkedIn for too many requests"
+ )
else:
- logger.error(f'LinkedIn response status code {response.status_code}')
+ err = f"LinkedIn response status code {response.status_code}"
+ err += f" - {response.text}"
+ logger.error(err)
return JobResponse(jobs=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
- logger.error(f'LinkedIn: Bad proxy')
+ logger.error(f"LinkedIn: Bad proxy")
else:
- logger.error(f'LinkedIn: {str(e)}')
+ logger.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")
@@ -126,11 +139,12 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
continue
seen_urls.add(job_url)
try:
- job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
+ fetch_desc = scraper_input.linkedin_fetch_description
+ job_post = self._process_job(job_card, job_url, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
- break
+ break
except Exception as e:
raise LinkedInException(str(e))
@@ -141,8 +155,10 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
- def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
- salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+ def _process_job(
+ self, job_card: Tag, job_url: str, full_descr: bool
+ ) -> Optional[JobPost]:
+ salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None
if salary_tag:
@@ -212,7 +228,9 @@ def _get_job_description(
"""
try:
session = create_session(is_tls=False, has_retry=True)
- response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
+ response = session.get(
+ job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+ )
response.raise_for_status()
except:
return None, None
@@ -225,10 +243,12 @@ def _get_job_description(
)
description = None
if div_content is not None:
+
def remove_attributes(tag):
for attr in list(tag.attrs):
del tag[attr]
return tag
+
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
@@ -257,11 +277,8 @@ def _get_location(self, metadata_card: Optional[Tag]) -> Location:
)
elif len(parts) == 3:
city, state, country = parts
- location = Location(
- city=city,
- state=state,
- country=Country.from_string(country)
- )
+ country = Country.from_string(country)
+ location = Location(city=city, state=state, country=country)
return location
@staticmethod
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index bdb8f355..844cf8b1 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -1,9 +1,10 @@
-import logging
-import re
+from __future__ import annotations
-import numpy as np
+import re
+import logging
import requests
import tls_client
+import numpy as np
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
@@ -14,7 +15,8 @@
if not logger.handlers:
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
- formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ formatter = logging.Formatter(format)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
@@ -33,7 +35,12 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
-def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
+def create_session(
+ proxy: dict | None = None,
+ is_tls: bool = True,
+ has_retry: bool = False,
+ delay: int = 1,
+) -> requests.Session:
"""
Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object
@@ -47,15 +54,17 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
if proxy:
session.proxies.update(proxy)
if has_retry:
- retries = Retry(total=3,
- connect=3,
- status=3,
- status_forcelist=[500, 502, 503, 504, 429],
- backoff_factor=delay)
+ retries = Retry(
+ total=3,
+ connect=3,
+ status=3,
+ status_forcelist=[500, 502, 503, 504, 429],
+ backoff_factor=delay,
+ )
adapter = HTTPAdapter(max_retries=retries)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
return session
@@ -73,17 +82,15 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
def currency_parser(cur_str):
# Remove any non-numerical characters
# except for ',' '.' or '-' (e.g. EUR)
- cur_str = re.sub("[^-0-9.,]", '', cur_str)
+ cur_str = re.sub("[^-0-9.,]", "", cur_str)
# Remove any 000s separators (either , or .)
- cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
+ cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
- if '.' in list(cur_str[-3:]):
+ if "." in list(cur_str[-3:]):
num = float(cur_str)
- elif ',' in list(cur_str[-3:]):
- num = float(cur_str.replace(',', '.'))
+ elif "," in list(cur_str[-3:]):
+ num = float(cur_str.replace(",", "."))
else:
num = float(cur_str)
return np.round(num, 2)
-
-
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 71529a3b..329de384 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -4,6 +4,9 @@
This module contains routines to scrape ZipRecruiter.
"""
+
+from __future__ import annotations
+
import math
import time
from datetime import datetime
@@ -16,7 +19,7 @@
logger,
extract_emails_from_text,
create_session,
- markdown_converter
+ markdown_converter,
)
from ...jobs import (
JobPost,
@@ -25,7 +28,7 @@
JobResponse,
JobType,
Country,
- DescriptionFormat
+ DescriptionFormat,
)
@@ -62,7 +65,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
break
if page > 1:
time.sleep(self.delay)
- logger.info(f'ZipRecruiter search page: {page}')
+ logger.info(f"ZipRecruiter search page: {page}")
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
@@ -88,25 +91,24 @@ def _find_jobs_in_page(
if continue_token:
params["continue_from"] = continue_token
try:
- res= self.session.get(
- f"{self.api_url}/jobs-app/jobs",
- headers=self.headers,
- params=params
+ res = self.session.get(
+ f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
)
if res.status_code not in range(200, 400):
if res.status_code == 429:
- logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
+ err = "429 Response - Blocked by ZipRecruiter for too many requests"
else:
- logger.error(f'ZipRecruiter response status code {res.status_code}')
+ err = f"ZipRecruiter response status code {res.status_code}"
+ err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
+ logger.error(err)
return jobs_list, ""
except Exception as e:
if "Proxy responded with" in str(e):
- logger.error(f'Indeed: Bad proxy')
+ logger.error(f"Indeed: Bad proxy")
else:
- logger.error(f'Indeed: {str(e)}')
+ logger.error(f"Indeed: {str(e)}")
return jobs_list, ""
-
res_data = res.json()
jobs_list = res_data.get("jobs", [])
next_continue_token = res_data.get("continue", None)
@@ -127,7 +129,11 @@ def _process_job(self, job: dict) -> JobPost | None:
self.seen_urls.add(job_url)
description = job.get("job_description", "").strip()
- description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
+ description = (
+ markdown_converter(description)
+ if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
+ else description
+ )
company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value)
@@ -138,23 +144,22 @@ def _process_job(self, job: dict) -> JobPost | None:
job_type = self._get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower()
)
- date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
+ date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
+ comp_interval = job.get("compensation_interval")
+ comp_interval = "yearly" if comp_interval == "annual" else comp_interval
+ comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
+ comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
+ comp_currency = job.get("compensation_currency")
return JobPost(
title=title,
company_name=company,
location=location,
job_type=job_type,
compensation=Compensation(
- interval="yearly"
- if job.get("compensation_interval") == "annual"
- else job.get("compensation_interval"),
- min_amount=int(job["compensation_min"])
- if "compensation_min" in job
- else None,
- max_amount=int(job["compensation_max"])
- if "compensation_max" in job
- else None,
- currency=job.get("compensation_currency"),
+ interval=comp_interval,
+ min_amount=comp_min,
+ max_amount=comp_max,
+ currency=comp_currency,
),
date_posted=date_posted,
job_url=job_url,
@@ -163,8 +168,9 @@ def _process_job(self, job: dict) -> JobPost | None:
)
def _get_cookies(self):
- data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
- self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
+ data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+ url = f"{self.api_url}/jobs-app/event"
+ self.session.post(url, data=data, headers=self.headers)
@staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
@@ -180,16 +186,13 @@ def _add_params(scraper_input) -> dict[str, str | Any]:
"location": scraper_input.location,
}
if scraper_input.hours_old:
- fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
- params['days'] = fromage
- job_type_map = {
- JobType.FULL_TIME: 'full_time',
- JobType.PART_TIME: 'part_time'
- }
+ params["days"] = max(scraper_input.hours_old // 24, 1)
+ job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
if scraper_input.job_type:
- params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
+ job_type = scraper_input.job_type
+ params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply:
- params['zipapply'] = 1
+ params["zipapply"] = 1
if scraper_input.is_remote:
params["remote"] = 1
if scraper_input.distance: