Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Description format #107

Merged
merged 2 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ work with us.*

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxy support (HTTP/S, SOCKS)
- Proxy support

[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
Expand Down Expand Up @@ -67,12 +67,13 @@ Optional
├── location (int)
├── distance (int): in miles
├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
├── proxy (str): in format 'http://user:pass@host:port'
├── is_remote (bool)
├── full_description (bool): fetches full description for LinkedIn (slower)
├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids
├── description_format (enum): markdown, html (format type of the job descriptions)
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.44"
version = "1.1.45"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand All @@ -18,6 +18,7 @@ beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"
html2text = "^2020.1.16"


[tool.poetry.group.dev.dependencies]
Expand Down
54 changes: 18 additions & 36 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,6 @@
GlassdoorException,
)

SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}


def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]


def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
Expand All @@ -39,7 +28,8 @@ def scrape_jobs(
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: str | None = None,
full_description: bool | None = False,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
hours_old: int = None,
Expand All @@ -49,6 +39,15 @@ def scrape_jobs(
Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}

def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]

def get_enum_from_value(value_str):
for job_type in JobType:
Expand All @@ -61,16 +60,15 @@ def get_enum_from_value(value_str):
def get_site_type():
site_types = list(Site)
if isinstance(site_name, str):
site_types = [_map_str_to_site(site_name)]
site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site):
site_types = [site_name]
elif isinstance(site_name, list):
site_types = [
_map_str_to_site(site) if isinstance(site, str) else site
map_str_to_site(site) if isinstance(site, str) else site
for site in site_name
]
return site_types

country_enum = Country.from_string(country_indeed)

scraper_input = ScraperInput(
Expand All @@ -82,7 +80,8 @@ def get_site_type():
is_remote=is_remote,
job_type=job_type,
easy_apply=easy_apply,
full_description=full_description,
description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
Expand All @@ -92,22 +91,7 @@ def get_site_type():
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)

try:
scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
if site == Site.LINKEDIN:
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException(str(e))
if site == Site.GLASSDOOR:
raise GlassdoorException(str(e))
else:
raise e
scraped_data: JobResponse = scraper.scrape(scraper_input)
return site.value, scraped_data

site_to_jobs_dict = {}
Expand Down Expand Up @@ -188,8 +172,6 @@ def worker(site):
"emails",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
else:
jobs_formatted_df = pd.DataFrame()

return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
return pd.DataFrame()
5 changes: 5 additions & 0 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ class Compensation(BaseModel):
currency: Optional[str] = "USD"


class DescriptionFormat(Enum):
    # Output format for scraped job-description text; selected via the
    # `description_format` parameter of scrape_jobs ("markdown" is the default).
    MARKDOWN = "markdown"
    HTML = "html"


class JobPost(BaseModel):
title: str
company_name: str
Expand Down
12 changes: 10 additions & 2 deletions src/jobspy/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat
)


class Site(Enum):
Expand All @@ -18,9 +25,10 @@ class ScraperInput(BaseModel):
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
full_description: bool = False
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

results_wanted: int = 15
hours_old: int | None = None
Expand Down
Loading
Loading