Multiple job types for Indeed, urgent keywords column (#56)
* enh(indeed): mult job types

* feat(jobs): urgent kws

* fix(indeed): use new session obj per request

* fix: emails as comma separated in output

* fix: put num urgent words in output

* chore: readme
cullenwatson authored Oct 10, 2023
1 parent 628f4de commit e5353e6
Showing 12 changed files with 268 additions and 244 deletions.
31 changes: 8 additions & 23 deletions README.md
@@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)_

```python
from jobspy import scrape_jobs
import pandas as pd

jobs: pd.DataFrame = scrape_jobs(
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer",
location="Dallas, TX",
results_wanted=10,

country_indeed='USA' # only needed for indeed

# use if you want to use a proxy
# proxy="http://jobspy:[email protected]:20001",
# offset=25 # use if you want to start at a specific offset
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", index=False)

# formatting for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc

# 1 output to console
print(jobs)

# 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)

# 3 output to .csv
# jobs.to_csv('jobs.csv', index=False)

# 4 output to .xlsx
# output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)

```
@@ -117,6 +99,9 @@ JobPost
│ ├── max_amount (int)
│ └── currency (enum)
├── date_posted (date)
├── emails (str)
├── num_urgent_words (int)
└── is_remote (bool) - just for Indeed at the moment
```
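
For context, a minimal sketch of reading the new columns after this change (assumes `python-jobspy` 1.1.13 is installed; the field names follow the schema above):

```python
from jobspy import scrape_jobs

# keep the batch small so the example runs quickly
jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=5,
    country_indeed="USA",  # only needed for indeed
)

# emails arrives as a comma-separated string; num_urgent_words as an int
print(jobs[["title", "company", "emails", "num_urgent_words", "is_remote"]])
```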

### Exceptions
14 changes: 7 additions & 7 deletions examples/JobSpy_Demo.py
@@ -6,23 +6,23 @@
search_term="software engineer",
location="Dallas, TX",
results_wanted=50,  # be wary: the higher this is, the more likely you'll get blocked (a rotating proxy should work, though)
country_indeed='USA',
country_indeed="USA",
offset=25  # start jobs from an offset (use if a search failed and you want to continue)
# proxy="http://jobspy:[email protected]:20001",
)

# formatting for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc

# 1: output to console
print(jobs)

# 2: output to .csv
jobs.to_csv('./jobs.csv', index=False)
print('outputted to jobs.csv')
jobs.to_csv("./jobs.csv", index=False)
print("outputted to jobs.csv")

# 3: output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.12"
version = "1.1.13"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/cullenwatson/JobSpy"
55 changes: 29 additions & 26 deletions src/jobspy/__init__.py
@@ -1,7 +1,7 @@
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, Optional
from typing import Tuple, Optional

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
@@ -26,18 +26,18 @@ def _map_str_to_site(site_name: str) -> Site:


def scrape_jobs(
site_name: str | List[str] | Site | List[Site],
search_term: str,
location: str = "",
distance: int = None,
is_remote: bool = False,
job_type: str = None,
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: Optional[str] = None,
offset: Optional[int] = 0
site_name: str | list[str] | Site | list[Site],
search_term: str,
location: str = "",
distance: int = None,
is_remote: bool = False,
job_type: str = None,
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: Optional[str] = None,
offset: Optional[int] = 0,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
@@ -72,7 +72,7 @@ def get_enum_from_value(value_str):
job_type=job_type,
easy_apply=easy_apply,
results_wanted=results_wanted,
offset=offset
offset=offset,
)

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@@ -98,8 +98,8 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
site_to_jobs_dict = {}

def worker(site):
site_value, scraped_data = scrape_site(site)
return site_value, scraped_data
site_val, scraped_info = scrape_site(site)
return site_val, scraped_info

with ThreadPoolExecutor() as executor:
future_to_site = {
@@ -110,7 +110,7 @@ def worker(site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data

jobs_dfs: List[pd.DataFrame] = []
jobs_dfs: list[pd.DataFrame] = []

for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
@@ -120,12 +120,14 @@ def worker(site):
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
if job_data["job_type"]:
# Take the first value from the job type tuple
job_data["job_type"] = job_data["job_type"].value[0]
else:
job_data["job_type"] = None

job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
job_data["location"] = Location(**job_data["location"]).display_location()

compensation_obj = job_data.get("compensation")
@@ -149,7 +151,7 @@

if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [
desired_order: list[str] = [
"job_url_hyper" if hyperlinks else "job_url",
"site",
"title",
@@ -158,12 +160,13 @@
"job_type",
"date_posted",
"interval",
"benefits",
"min_amount",
"max_amount",
"currency",
"is_remote",
"num_urgent_words",
"benefits",
"emails",
"job_url_hyper" if hyperlinks else "job_url",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
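
A self-contained sketch of the flattening applied in the mapping above: each `JobType` value is a tuple of synonyms (the removed code's comment notes the first entry is used for display), and a list of job types collapses to one comma-separated string. The two-member enum here is an illustrative stand-in, not the library's full definition:

```python
from enum import Enum


class JobType(Enum):
    # abbreviated stand-in for the real enum; each value is a tuple of
    # synonyms, with the first entry used for display
    FULL_TIME = ("fulltime", "full-time")
    CONTRACT = ("contract", "contractor")


def flatten_job_types(job_types: list[JobType] | None) -> str | None:
    """Mirror the DataFrame mapping: [FULL_TIME, CONTRACT] -> 'fulltime, contract'."""
    return ", ".join(jt.value[0] for jt in job_types) if job_types else None


print(flatten_job_types([JobType.FULL_TIME, JobType.CONTRACT]))  # fulltime, contract
print(flatten_job_types(None))  # None
```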
15 changes: 9 additions & 6 deletions src/jobspy/jobs/__init__.py
@@ -182,12 +182,15 @@ class JobPost(BaseModel):
job_url: str
location: Optional[Location]

description: Optional[str] = None
job_type: Optional[JobType] = None
compensation: Optional[Compensation] = None
date_posted: Optional[date] = None
benefits: Optional[str] = None
emails: Optional[list[str]] = None
description: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
benefits: str | None = None
emails: list[str] | None = None
num_urgent_words: int | None = None
is_remote: bool | None = None
# company_industry: str | None = None


class JobResponse(BaseModel):
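
The new `num_urgent_words` field is a plain count; a hypothetical sketch of how such a count could be derived from a description (the keyword list and matching logic here are assumptions for illustration; the real logic lives in the scrapers):

```python
import re

# assumed keyword list, for illustration only
URGENT_KEYWORDS = ["urgent", "urgently", "immediate start", "hiring now", "asap"]


def count_urgent_words(description: str) -> int:
    """Count urgency-signaling phrases in a job description."""
    text = description.lower()
    return sum(
        len(re.findall(rf"\b{re.escape(kw)}\b", text)) for kw in URGENT_KEYWORDS
    )


print(count_urgent_words("Urgent need! Immediate start, apply ASAP."))  # 3
```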