Job type #106

Merged: 3 commits, Feb 12, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.43"
+version = "1.1.44"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
 homepage = "https://github.com/Bunsly/JobSpy"
2 changes: 1 addition & 1 deletion src/jobspy/__init__.py
@@ -192,4 +192,4 @@ def worker(site):
     else:
         jobs_formatted_df = pd.DataFrame()
 
-    return jobs_formatted_df.sort_values(by='date_posted', ascending=False)
+    return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
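
For context on the sort change above, here is a minimal standalone sketch of the new multi-key ordering (pandas only; the rows are made up for illustration, not taken from the scrapers):

import pandas as pd

# Illustrative rows only; the real DataFrame is assembled from the scraper results.
jobs = pd.DataFrame({
    "site": ["linkedin", "indeed", "zip_recruiter", "indeed"],
    "date_posted": pd.to_datetime(
        ["2024-02-10", "2024-02-09", "2024-02-08", "2024-02-11"]
    ),
})

# Old behavior: one global sort, newest first, all sites interleaved.
# New behavior: rows grouped by site (A-Z), newest first within each site.
print(jobs.sort_values(by=["site", "date_posted"], ascending=[True, False]))
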
195 changes: 174 additions & 21 deletions src/jobspy/scrapers/glassdoor/__init__.py

Large diffs are not rendered by default.

94 changes: 14 additions & 80 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -11,7 +11,6 @@
 from typing import Any
 from datetime import datetime
 
-import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,7 @@
     extract_emails_from_text,
     create_session,
     get_enum_from_job_type,
-    modify_and_get_description
+    logger
 )
 from ...jobs import (
     JobPost,
@@ -50,13 +49,14 @@ def __init__(self, proxy: str | None = None):
 
     def scrape_page(
         self, scraper_input: ScraperInput, page: int
-    ) -> tuple[list[JobPost], int]:
+    ) -> list[JobPost]:
         """
         Scrapes a page of Indeed for jobs with scraper_input criteria
         :param scraper_input:
         :param page:
         :return: jobs found on page, total number of jobs found for search
         """
+        job_list = []
         self.country = scraper_input.country
         domain = self.country.indeed_domain_value
         self.url = f"https://{domain}.indeed.com"
@@ -76,14 +76,14 @@
             )
         except Exception as e:
             if "Proxy responded with" in str(e):
-                raise IndeedException("bad proxy")
-            raise IndeedException(str(e))
+                logger.error(f'Indeed: Bad proxy')
+            else:
+                logger.error(f'Indeed: {str(e)}')
+            return job_list
 
         soup = BeautifulSoup(response.content, "html.parser")
-        job_list = []
-        total_num_jobs = IndeedScraper.total_jobs(soup)
         if "did not match any jobs" in response.text:
-            return job_list, total_num_jobs
+            return job_list
 
         jobs = IndeedScraper.parse_jobs(
             soup
@@ -145,15 +145,15 @@ def process_job(job: dict, job_detailed: dict) -> JobPost | None:
 
         job_list = [result.result() for result in job_results if result.result()]
 
-        return job_list, total_num_jobs
+        return job_list
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Indeed for jobs with scraper_input criteria
        :param scraper_input:
         :return: job_response
         """
-        job_list, total_results = self.scrape_page(scraper_input, 0)
+        job_list = self.scrape_page(scraper_input, 0)
         pages_processed = 1
 
         while len(self.seen_urls) < scraper_input.results_wanted:
@@ -167,7 +167,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
             ]
 
             for future in futures:
-                jobs, _ = future.result()
+                jobs = future.result()
                 if jobs:
                     job_list += jobs
                     new_jobs = True
@@ -182,55 +182,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         if len(self.seen_urls) > scraper_input.results_wanted:
             job_list = job_list[:scraper_input.results_wanted]
 
-        job_response = JobResponse(
-            jobs=job_list,
-            total_results=total_results,
-        )
-        return job_response
-
-    def get_description(self, job_page_url: str) -> str | None:
-        """
-        Retrieves job description by going to the job page url
-        :param job_page_url:
-        :return: description
-        """
-        parsed_url = urllib.parse.urlparse(job_page_url)
-        params = urllib.parse.parse_qs(parsed_url.query)
-        jk_value = params.get("jk", [None])[0]
-        formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
-        session = create_session(self.proxy)
-
-        try:
-            response = session.get(
-                formatted_url,
-                headers=self.get_headers(),
-                allow_redirects=True,
-                timeout_seconds=5,
-            )
-        except Exception as e:
-            return None
-
-        if response.status_code not in range(200, 400):
-            return None
-
-        try:
-            soup = BeautifulSoup(response.text, 'html.parser')
-            script_tags = soup.find_all('script')
-
-            job_description = ''
-            for tag in script_tags:
-                if 'window._initialData' in tag.text:
-                    json_str = tag.text
-                    json_str = json_str.split('window._initialData=')[1]
-                    json_str = json_str.rsplit(';', 1)[0]
-                    data = json.loads(json_str)
-                    job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
-                    break
-        except (KeyError, TypeError, IndexError):
-            return None
-
-        soup = BeautifulSoup(job_description, "html.parser")
-        return modify_and_get_description(soup)
+        return JobResponse(jobs=job_list)
 
     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:
@@ -330,24 +282,6 @@ def find_mosaic_script() -> Tag | None:
                 "Could not find any results for the search"
             )
 
-    @staticmethod
-    def total_jobs(soup: BeautifulSoup) -> int:
-        """
-        Parses the total jobs for that search from soup object
-        :param soup:
-        :return: total_num_jobs
-        """
-        script = soup.find("script", string=lambda t: t and "window._initialData" in t)
-
-        pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL)
-        match = pattern.search(script.string)
-        total_num_jobs = 0
-        if match:
-            json_str = match.group(1)
-            data = json.loads(json_str)
-            total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
-        return total_num_jobs
-
     @staticmethod
     def get_headers():
         return {
@@ -380,7 +314,7 @@ def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
         if scraper_input.is_remote:
             sc_values.append("attr(DSQF7)")
         if scraper_input.job_type:
-            sc_values.append("jt({})".format(scraper_input.job_type.value))
+            sc_values.append("jt({})".format(scraper_input.job_type.value[0]))
 
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
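
A note on the jt({}) change above: it assumes JobType is an enum whose .value is a sequence of alias strings, with the first entry being the canonical label Indeed expects, so .value[0] sends that label instead of stringifying the whole tuple. A hypothetical sketch of that shape (members illustrative; the real JobType lives in src/jobspy/jobs.py):

from enum import Enum

# Hypothetical shape, assumed from how .value[0] is used in this PR.
class JobType(Enum):
    FULL_TIME = ("fulltime", "full-time")
    PART_TIME = ("parttime", "part-time")

job_type = JobType.FULL_TIME
print("jt({})".format(job_type.value))     # old: jt(('fulltime', 'full-time'))
print("jt({})".format(job_type.value[0]))  # new: jt(fulltime)
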
@@ -406,7 +340,7 @@ def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
             taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
             for taxonomy in job.get("taxonomyAttributes", [])
         )
-        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
 
     def get_job_details(self, job_keys: list[str]) -> dict:
         """
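
The scrape_page error path in this file now degrades gracefully instead of raising: a proxy or network failure logs an error and yields an empty page, so scrape() simply collects fewer jobs rather than aborting. A rough sketch of the pattern (simplified stand-in, not the actual scraper class):

import logging

logger = logging.getLogger("JobSpy")

def scrape_page_sketch(fetch) -> list:
    # Simplified illustration of the new failure mode.
    job_list = []
    try:
        response = fetch()
    except Exception as e:
        # Old behavior: raise IndeedException, aborting the whole scrape.
        # New behavior: log and return an empty page so other pages/sites still run.
        logger.error(f"Indeed: {e}")
        return job_list
    job_list.append(response)
    return job_list

def failing_fetch():
    raise RuntimeError("Proxy responded with 407")

assert scrape_page_sketch(failing_fetch) == []  # no exception, just an empty page
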
14 changes: 9 additions & 5 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -31,8 +31,7 @@
     count_urgent_words,
     extract_emails_from_text,
     get_enum_from_job_type,
-    currency_parser,
-    modify_and_get_description
+    currency_parser
 )
 
 
@@ -236,10 +235,15 @@ def get_job_description(
         div_content = soup.find(
             "div", class_=lambda x: x and "show-more-less-html__markup" in x
         )
 
         description = None
-        if div_content:
-            description = modify_and_get_description(div_content)
+        if div_content is not None:
+            def remove_attributes(tag):
+                for attr in list(tag.attrs):
+                    del tag[attr]
+                return tag
+
+            div_content = remove_attributes(div_content)
+            description = div_content.prettify(formatter="html")
 
     def get_job_type(
         soup_job_type: BeautifulSoup,
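
For reviewers, a standalone sketch of what the new LinkedIn description handling does: the removed helper flattened the description to plain text, while the new code strips attributes from the outer tag and keeps the HTML markup (the sample HTML below is made up):

from bs4 import BeautifulSoup

# Made-up sample; the real input is LinkedIn's show-more-less-html__markup div.
html = '<div class="show-more-less-html__markup" dir="ltr"><p>Build <b>scrapers</b>.</p></div>'
div_content = BeautifulSoup(html, "html.parser").div

# Mirrors remove_attributes: strips attributes from the outer tag only
# (descendant tags keep theirs).
for attr in list(div_content.attrs):
    del div_content[attr]

# Prints the same markup re-indented, now as a bare <div>...</div>.
print(div_content.prettify(formatter="html"))
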
20 changes: 12 additions & 8 deletions src/jobspy/scrapers/utils.py
@@ -1,4 +1,5 @@
 import re
+import logging
 import numpy as np
 
 import tls_client
@@ -7,14 +8,14 @@
 
 from ..jobs import JobType
 
-
-def modify_and_get_description(soup):
-    for li in soup.find_all('li'):
-        li.string = "- " + li.get_text()
-
-    description = soup.get_text(separator='\n').strip()
-    description = re.sub(r'\n+', '\n', description)
-    return description
+logger = logging.getLogger("JobSpy")
+if not logger.handlers:
+    logger.setLevel(logging.ERROR)
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.ERROR)
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
 
 
 def count_urgent_words(description: str) -> int:
@@ -79,6 +80,7 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
             res = job_type
     return res
 
+
 def currency_parser(cur_str):
     # Remove any non-numerical characters
     # except for ',' '.' or '-' (e.g. EUR)
@@ -94,3 +96,5 @@ def currency_parser(cur_str):
         num = float(cur_str)
 
     return np.round(num, 2)
+
+
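
One note on the if not logger.handlers: guard above: logging.getLogger returns a process-wide singleton, so without the guard repeated imports or reloads would attach duplicate handlers and every message would print once per handler. A quick sketch of the failure it prevents:

import logging

logger = logging.getLogger("JobSpy")

# Simulate the module-level setup running three times.
for _ in range(3):
    if not logger.handlers:          # the guard in utils.py
        logger.addHandler(logging.StreamHandler())

print(len(logger.handlers))          # 1 with the guard; 3 without it
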
28 changes: 8 additions & 20 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -9,13 +9,12 @@
 from datetime import datetime, timezone
 from typing import Optional, Tuple, Any
 
-from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
-from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
+from ..utils import count_urgent_words, extract_emails_from_text, create_session
 
 
 class ZipRecruiterScraper(Scraper):
@@ -107,9 +106,7 @@ def process_job(self, job: dict) -> JobPost | None:
             return
         self.seen_urls.add(job_url)
 
-        job_description_html = job.get("job_description", "").strip()
-        description_soup = BeautifulSoup(job_description_html, "html.parser")
-        description = modify_and_get_description(description_soup)
+        description = job.get("job_description", "").strip()
 
         company = job.get("hiring_company", {}).get("name")
         country_value = "usa" if job.get("job_country") == "US" else "canada"
@@ -168,25 +165,16 @@ def add_params(scraper_input) -> dict[str, str | Any]:
         if scraper_input.hours_old:
             fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
             params['days'] = fromage
-        job_type_value = None
+        job_type_map = {
+            JobType.FULL_TIME: 'full_time',
+            JobType.PART_TIME: 'part_time'
+        }
         if scraper_input.job_type:
-            if scraper_input.job_type.value == "fulltime":
-                job_type_value = "full_time"
-            elif scraper_input.job_type.value == "parttime":
-                job_type_value = "part_time"
-            else:
-                job_type_value = scraper_input.job_type.value
+            params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
         if scraper_input.easy_apply:
             params['zipapply'] = 1
-
-        if job_type_value:
-            params[
-                "refine_by_employment"
-            ] = f"employment_type:employment_type:{job_type_value}"
-
         if scraper_input.is_remote:
-            params["refine_by_location_type"] = "only_remote"
-
+            params["remote"] = 1
         if scraper_input.distance:
             params["radius"] = scraper_input.distance
 
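
A quick sketch of how the new employment_type mapping resolves (the JobType shape is assumed, as in the Indeed note above; members illustrative):

from enum import Enum

# Assumed shape of the real enum in src/jobspy/jobs.py.
class JobType(Enum):
    FULL_TIME = ("fulltime",)
    PART_TIME = ("parttime",)
    INTERNSHIP = ("internship",)

job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}

for jt in (JobType.FULL_TIME, JobType.INTERNSHIP):
    employment_type = job_type_map[jt] if jt in job_type_map else jt.value[0]
    print(jt.name, "->", employment_type)
# FULL_TIME -> full_time    (mapped to ZipRecruiter's naming)
# INTERNSHIP -> internship  (falls through to the enum's first alias)
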