Skip to content

Commit

Permalink
enh: description format
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Feb 14, 2024
1 parent aeb1a50 commit 191a5ea
Show file tree
Hide file tree
Showing 11 changed files with 575 additions and 576 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Optional
├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
├── is_remote (bool)
├── full_description (bool): fetches full description for LinkedIn (slower)
├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.44"
version = "1.1.45"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand All @@ -18,6 +18,7 @@ beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"
html2text = "^2020.1.16"


[tool.poetry.group.dev.dependencies]
Expand Down
54 changes: 18 additions & 36 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,6 @@
GlassdoorException,
)

SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}


def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]


def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
Expand All @@ -39,7 +28,8 @@ def scrape_jobs(
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: str | None = None,
full_description: bool | None = False,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
hours_old: int = None,
Expand All @@ -49,6 +39,15 @@ def scrape_jobs(
Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}

def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]

def get_enum_from_value(value_str):
for job_type in JobType:
Expand All @@ -61,16 +60,15 @@ def get_enum_from_value(value_str):
def get_site_type():
site_types = list(Site)
if isinstance(site_name, str):
site_types = [_map_str_to_site(site_name)]
site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site):
site_types = [site_name]
elif isinstance(site_name, list):
site_types = [
_map_str_to_site(site) if isinstance(site, str) else site
map_str_to_site(site) if isinstance(site, str) else site
for site in site_name
]
return site_types

country_enum = Country.from_string(country_indeed)

scraper_input = ScraperInput(
Expand All @@ -82,7 +80,8 @@ def get_site_type():
is_remote=is_remote,
job_type=job_type,
easy_apply=easy_apply,
full_description=full_description,
description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
Expand All @@ -92,22 +91,7 @@ def get_site_type():
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)

try:
scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
if site == Site.LINKEDIN:
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException(str(e))
if site == Site.GLASSDOOR:
raise GlassdoorException(str(e))
else:
raise e
scraped_data: JobResponse = scraper.scrape(scraper_input)
return site.value, scraped_data

site_to_jobs_dict = {}
Expand Down Expand Up @@ -188,8 +172,6 @@ def worker(site):
"emails",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
else:
jobs_formatted_df = pd.DataFrame()

return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
return pd.DataFrame()
5 changes: 5 additions & 0 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ class Compensation(BaseModel):
currency: Optional[str] = "USD"


class DescriptionFormat(Enum):
MARKDOWN = "markdown"
HTML = "html"


class JobPost(BaseModel):
title: str
company_name: str
Expand Down
12 changes: 10 additions & 2 deletions src/jobspy/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat
)


class Site(Enum):
Expand All @@ -18,9 +25,10 @@ class ScraperInput(BaseModel):
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
full_description: bool = False
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

results_wanted: int = 15
hours_old: int | None = None
Expand Down
Loading

0 comments on commit 191a5ea

Please sign in to comment.