Multiple job types for Indeed, urgent keywords column (#56)
* enh(indeed): mult job types

* feat(jobs): urgent kws

* fix(indeed): use new session obj per request

* fix: emails as comma separated in output

* fix: put num urgent words in output

* chore: readme
cullenwatson authored Oct 10, 2023
1 parent 628f4de commit e5353e6
Showing 12 changed files with 268 additions and 244 deletions.
31 changes: 8 additions & 23 deletions README.md
@@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)_

```python
from jobspy import scrape_jobs
import pandas as pd

jobs: pd.DataFrame = scrape_jobs(
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer",
location="Dallas, TX",
results_wanted=10,

country_indeed='USA' # only needed for indeed

# use if you want to use a proxy
# proxy="http://jobspy:[email protected]:20001",
# offset=25 # use if you want to start at a specific offset
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", index=False)

# formatting for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc

# 1 output to console
print(jobs)

# 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)

# 3 output to .csv
# jobs.to_csv('jobs.csv', index=False)

# 4 output to .xlsx
# output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)

```
@@ -117,6 +99,9 @@ JobPost
│ ├── max_amount (int)
│ └── currency (enum)
├── date_posted (date)
├── emails (str)
├── num_urgent_words (int)
└── is_remote (bool) - just for Indeed at the moment
```
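
For context, a minimal sketch of reading the new columns after this change (assumes `python-jobspy` 1.1.13 is installed; the field names follow the schema above):

```python
from jobspy import scrape_jobs

# keep the batch small so the example runs quickly
jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=5,
    country_indeed="USA",  # only needed for indeed
)

# emails arrives as a comma-separated string; num_urgent_words as an int
print(jobs[["title", "company", "emails", "num_urgent_words", "is_remote"]])
```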

### Exceptions
14 changes: 7 additions & 7 deletions examples/JobSpy_Demo.py
@@ -6,23 +6,23 @@
search_term="software engineer",
location="Dallas, TX",
results_wanted=50,  # be wary: the higher this is, the more likely you'll get blocked (a rotating proxy should work, though)
country_indeed='USA',
country_indeed="USA",
offset=25  # start jobs from an offset (use if a search failed and you want to continue)
# proxy="http://jobspy:[email protected]:20001",
)

# formatting for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc

# 1: output to console
print(jobs)

# 2: output to .csv
jobs.to_csv('./jobs.csv', index=False)
print('outputted to jobs.csv')
jobs.to_csv("./jobs.csv", index=False)
print("outputted to jobs.csv")

# 3: output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.12"
version = "1.1.13"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/cullenwatson/JobSpy"
55 changes: 29 additions & 26 deletions src/jobspy/__init__.py
@@ -1,7 +1,7 @@
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, Optional
from typing import Tuple, Optional

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
@@ -26,18 +26,18 @@ def _map_str_to_site(site_name: str) -> Site:


def scrape_jobs(
site_name: str | List[str] | Site | List[Site],
search_term: str,
location: str = "",
distance: int = None,
is_remote: bool = False,
job_type: str = None,
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: Optional[str] = None,
offset: Optional[int] = 0
site_name: str | list[str] | Site | list[Site],
search_term: str,
location: str = "",
distance: int = None,
is_remote: bool = False,
job_type: str = None,
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: Optional[str] = None,
offset: Optional[int] = 0,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
@@ -72,7 +72,7 @@ def get_enum_from_value(value_str):
job_type=job_type,
easy_apply=easy_apply,
results_wanted=results_wanted,
offset=offset
offset=offset,
)

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@@ -98,8 +98,8 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
site_to_jobs_dict = {}

def worker(site):
site_value, scraped_data = scrape_site(site)
return site_value, scraped_data
site_val, scraped_info = scrape_site(site)
return site_val, scraped_info

with ThreadPoolExecutor() as executor:
future_to_site = {
@@ -110,7 +110,7 @@ def worker(site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data

jobs_dfs: List[pd.DataFrame] = []
jobs_dfs: list[pd.DataFrame] = []

for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
@@ -120,12 +120,14 @@ def worker(site):
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
if job_data["job_type"]:
# Take the first value from the job type tuple
job_data["job_type"] = job_data["job_type"].value[0]
else:
job_data["job_type"] = None

job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
job_data["location"] = Location(**job_data["location"]).display_location()

compensation_obj = job_data.get("compensation")
@@ -149,7 +151,7 @@

if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [
desired_order: list[str] = [
"job_url_hyper" if hyperlinks else "job_url",
"site",
"title",
@@ -158,12 +160,13 @@
"job_type",
"date_posted",
"interval",
"benefits",
"min_amount",
"max_amount",
"currency",
"is_remote",
"num_urgent_words",
"benefits",
"emails",
"job_url_hyper" if hyperlinks else "job_url",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
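
A self-contained sketch of the flattening applied in the mapping above: each `JobType` value is a tuple of synonyms (the removed code's comment notes the first entry is used for display), and a list of job types collapses to one comma-separated string. The two-member enum here is an illustrative stand-in, not the library's full definition:

```python
from enum import Enum


class JobType(Enum):
    # abbreviated stand-in for the real enum; each value is a tuple of
    # synonyms, with the first entry used for display
    FULL_TIME = ("fulltime", "full-time")
    CONTRACT = ("contract", "contractor")


def flatten_job_types(job_types: list[JobType] | None) -> str | None:
    """Mirror the DataFrame mapping: [FULL_TIME, CONTRACT] -> 'fulltime, contract'."""
    return ", ".join(jt.value[0] for jt in job_types) if job_types else None


print(flatten_job_types([JobType.FULL_TIME, JobType.CONTRACT]))  # fulltime, contract
print(flatten_job_types(None))  # None
```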
15 changes: 9 additions & 6 deletions src/jobspy/jobs/__init__.py
@@ -182,12 +182,15 @@ class JobPost(BaseModel):
job_url: str
location: Optional[Location]

description: Optional[str] = None
job_type: Optional[JobType] = None
compensation: Optional[Compensation] = None
date_posted: Optional[date] = None
benefits: Optional[str] = None
emails: Optional[list[str]] = None
description: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
benefits: str | None = None
emails: list[str] | None = None
num_urgent_words: int | None = None
is_remote: bool | None = None
# company_industry: str | None = None


class JobResponse(BaseModel):
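
The new `num_urgent_words` field is a plain count; a hypothetical sketch of how such a count could be derived from a description (the keyword list and matching logic here are assumptions for illustration; the real logic lives in the scrapers):

```python
import re

# assumed keyword list, for illustration only
URGENT_KEYWORDS = ["urgent", "urgently", "immediate start", "hiring now", "asap"]


def count_urgent_words(description: str) -> int:
    """Count urgency-signaling phrases in a job description."""
    text = description.lower()
    return sum(
        len(re.findall(rf"\b{re.escape(kw)}\b", text)) for kw in URGENT_KEYWORDS
    )


print(count_urgent_words("Urgent need! Immediate start, apply ASAP."))  # 3
```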