enh(linkedin): search by company ids
cullenwatson committed Feb 4, 2024
1 parent 02caf1b commit ba8e315
Showing 6 changed files with 50 additions and 34 deletions.
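
The headline change is a new linkedin_company_ids argument on scrape_jobs, which restricts LinkedIn results to specific numeric company IDs. A minimal usage sketch under that assumption (the search term and IDs below are made-up examples, not values from this commit):

from jobspy import scrape_jobs

# Limit the LinkedIn search to particular companies via their numeric
# LinkedIn company IDs (the IDs here are placeholders for illustration).
jobs = scrape_jobs(
    site_name="linkedin",
    search_term="software engineer",
    linkedin_company_ids=[1441, 1035],  # hypothetical example IDs
    results_wanted=15,
)
print(jobs.head())  # scrape_jobs returns a pandas DataFrame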
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.41"
version = "1.1.42"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
43 changes: 25 additions & 18 deletions src/jobspy/__init__.py
@@ -1,7 +1,7 @@
import pandas as pd
from typing import Tuple
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import Tuple, Optional

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
@@ -29,19 +29,20 @@ def _map_str_to_site(site_name: str) -> Site:


def scrape_jobs(
site_name: str | list[str] | Site | list[Site],
search_term: str,
location: str = "",
distance: int = None,
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
location: str | None = None,
distance: int | None = None,
is_remote: bool = False,
job_type: str = None,
easy_apply: bool = False, # linkedin
job_type: str | None = None,
easy_apply: bool | None = None,
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: Optional[str] = None,
full_description: Optional[bool] = False,
offset: Optional[int] = 0,
proxy: str | None = None,
full_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
@@ -56,18 +57,23 @@ def get_enum_from_value(value_str):

job_type = get_enum_from_value(job_type) if job_type else None

if type(site_name) == str:
site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list
site_type = [
_map_str_to_site(site) if type(site) == str else site_name
for site in site_name
]
def get_site_type():
site_types = list(Site)
if isinstance(site_name, str):
site_types = [_map_str_to_site(site_name)]
elif isinstance(site_name, Site):
site_types = [site_name]
elif isinstance(site_name, list):
site_types = [
_map_str_to_site(site) if isinstance(site, str) else site
for site in site_name
]
return site_types

country_enum = Country.from_string(country_indeed)

scraper_input = ScraperInput(
site_type=site_type,
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
location=location,
@@ -77,6 +83,7 @@ def get_enum_from_value(value_str):
easy_apply=easy_apply,
full_description=full_description,
results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids,
offset=offset,
)
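
Since site_name is now optional, get_site_type() falls back to every supported board when nothing is passed. A small sketch of that default, assuming the package is installed as python-jobspy:

from jobspy import scrape_jobs

# No site_name: the new default builds list(Site), so all supported boards
# (LinkedIn, Indeed, Glassdoor, ZipRecruiter) are scraped for the same query.
all_jobs = scrape_jobs(search_term="data analyst", results_wanted=10)
print(len(all_jobs), "jobs collected across all supported sites")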

18 changes: 9 additions & 9 deletions src/jobspy/scrapers/__init__.py
@@ -1,5 +1,4 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country
from typing import List, Optional, Any


class Site(Enum):
@@ -10,23 +9,24 @@ class Site(Enum):


class ScraperInput(BaseModel):
site_type: List[Site]
search_term: str
site_type: list[Site]
search_term: str | None = None

location: str = None
country: Optional[Country] = Country.USA
distance: Optional[int] = None
location: str | None = None
country: Country | None = Country.USA
distance: int | None = None
is_remote: bool = False
job_type: Optional[JobType] = None
easy_apply: bool = None # linkedin
job_type: JobType | None = None
easy_apply: bool | None = None
full_description: bool = False
offset: int = 0
linkedin_company_ids: list[int] | None = None

results_wanted: int = 15


class Scraper:
def __init__(self, site: Site, proxy: Optional[List[str]] = None):
def __init__(self, site: Site, proxy: list[str] | None = None):
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

2 changes: 1 addition & 1 deletion src/jobspy/scrapers/indeed/__init__.py
@@ -348,7 +348,7 @@ def is_remote_job(job: dict) -> bool:
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
params = {
"q": scraper_input.search_term,
"l": scraper_input.location,
"l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date"
10 changes: 7 additions & 3 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -70,7 +70,9 @@ def job_type_code(job_type_enum):

return mapping.get(job_type_enum, "")

while len(job_list) < scraper_input.results_wanted and page < 1000:
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000

while continue_search():
session = create_session(is_tls=False, has_retry=True, delay=5)
params = {
"keywords": scraper_input.search_term,
@@ -83,6 +85,7 @@ def job_type_code(job_type_enum):
"pageNum": 0,
"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
}

params = {k: v for k, v in params.items() if v is not None}
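
For reference, the new f_C filter is just the company IDs joined with commas, and it is dropped along with the other None-valued params before the request. A standalone sketch of that behaviour with hypothetical IDs:

# Hypothetical IDs; in the scraper they come from scraper_input.linkedin_company_ids.
company_ids = [1441, 1035]
params = {
    "keywords": "python developer",
    "f_C": ",".join(map(str, company_ids)) if company_ids else None,
}
# Mirrors the dict comprehension above: strip params that are None.
params = {k: v for k, v in params.items() if v is not None}
print(params)  # {'keywords': 'python developer', 'f_C': '1441,1035'}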
@@ -130,8 +133,9 @@ def job_type_code(job_type_enum):
except Exception as e:
raise LinkedInException("Exception occurred while processing jobs")

page += 25
time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
if continue_search():
time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
page += 25

job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
9 changes: 7 additions & 2 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -31,6 +31,7 @@ def __init__(self, proxy: Optional[str] = None):

self.jobs_per_page = 20
self.seen_urls = set()
self.delay = 5

def find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None
@@ -59,7 +60,6 @@ def find_jobs_in_page(
raise ZipRecruiterException("bad proxy")
raise ZipRecruiterException(str(e))

time.sleep(5)
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get("continue", None)
@@ -85,6 +85,9 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
if len(job_list) >= scraper_input.results_wanted:
break

if page > 1:
time.sleep(self.delay)

jobs_on_page, continue_token = self.find_jobs_in_page(
scraper_input, continue_token
)
@@ -108,7 +111,7 @@ def process_job(self, job: dict) -> JobPost | None:
description_soup = BeautifulSoup(job_description_html, "html.parser")
description = modify_and_get_description(description_soup)

company = job["hiring_company"].get("name") if "hiring_company" in job else None
company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value)

@@ -184,6 +187,8 @@ def add_params(scraper_input) -> dict[str, str | Any]:
if scraper_input.distance:
params["radius"] = scraper_input.distance

params = {k: v for k, v in params.items() if v is not None}

return params

@staticmethod