Remove pandas warning #118

Merged · 3 commits · Mar 1, 2024
20 changes: 15 additions & 5 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.45"
version = "1.1.46"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
homepage = "https://github.com/Bunsly/JobSpy"
@@ -13,12 +13,12 @@ packages = [
[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.31.0"
tls-client = "*"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"
html2text = "^2020.1.16"
tls-client = "^1.0.1"


[tool.poetry.group.dev.dependencies]
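For context on the dependency change above: tls-client moves from an unconstrained `"*"` to a caret pin. Under Poetry's documented caret rules, `"^1.0.1"` allows compatible releases up to, but not including, 2.0.0. A quick sketch checking the equivalent range with the `packaging` library (used here purely for illustration; it is not part of this PR):

```python
# Illustrative only: evaluate the version range that Poetry's "^1.0.1"
# caret constraint corresponds to (>=1.0.1,<2.0.0).
from packaging.specifiers import SpecifierSet
from packaging.version import Version

caret_range = SpecifierSet(">=1.0.1,<2.0.0")

for candidate in ("1.0.0", "1.0.1", "1.7.3", "2.0.0"):
    print(candidate, Version(candidate) in caret_range)
# 1.0.0 False, 1.0.1 True, 1.7.3 True, 2.0.0 False
```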
22 changes: 19 additions & 3 deletions src/jobspy/__init__.py
@@ -152,8 +152,14 @@ def worker(site):
jobs_dfs.append(job_df)

if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: list[str] = [
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]

# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)

# Desired column order
desired_order = [
"job_url_hyper" if hyperlinks else "job_url",
"site",
"title",
@@ -172,6 +178,16 @@ def worker(site):
"emails",
"description",
]
return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])

# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
jobs_df[column] = None # Add missing columns as empty

# Reorder the DataFrame according to the desired order
jobs_df = jobs_df[desired_order]

# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
else:
return pd.DataFrame()
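The warning this PR removes is the pandas 2.1 FutureWarning about concatenating DataFrames that contain empty or all-NA columns. A minimal sketch of the workaround used above, assuming pandas >= 2.1 and with illustrative column names rather than the scraper's full schema:

```python
# Minimal sketch, assuming pandas >= 2.1; "title" / "emails" are illustrative columns.
import numpy as np
import pandas as pd

per_site = [
    pd.DataFrame({"title": ["engineer"], "emails": [np.nan]}),           # "emails" is all-NA here
    pd.DataFrame({"title": ["analyst"], "emails": ["jobs@example.com"]}),
]

# Concatenating frames that contain all-NA columns can emit
# "FutureWarning: The behavior of DataFrame concatenation with empty or all-NA
# entries is deprecated ..." on pandas 2.1+, so all-NA columns are dropped first.
filtered = [df.dropna(axis=1, how="all") for df in per_site]
combined = pd.concat(filtered, ignore_index=True)

# Any dropped column is then re-added so downstream code sees a stable schema,
# mirroring the loop added in the diff above.
for column in ("title", "emails"):
    if column not in combined.columns:
        combined[column] = None

print(combined)
```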
14 changes: 8 additions & 6 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -82,7 +82,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
if not new_jobs:
break


if len(self.seen_urls) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted]

@@ -124,12 +123,15 @@ def _scrape_page(self, page: int=0) -> list[JobPost]:
return job_list

jobs = IndeedScraper._parse_jobs(soup)
if not jobs:
return []
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
logger.error("Indeed - No jobs found.")
return []

jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
@@ -302,11 +304,11 @@ def find_mosaic_script() -> Tag | None:
jobs = json.loads(m.group(1).strip())
return jobs
else:
raise IndeedException("Could not find mosaic provider job cards data")
logger.warning(f'Indeed: Could not find mosaic provider job cards data')
return {}
else:
raise IndeedException(
"Could not find any results for the search"
)
logger.warning(f"Indeed: Could not parse any jobs on the page")
return {}

@staticmethod
def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
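The Indeed changes above also swap hard failures for logging: pages that yield nothing now return empty containers instead of raising IndeedException, so one bad page no longer aborts the whole multi-site scrape. A rough, self-contained sketch of that fail-soft pattern (parse_jobs and the sample payloads below are hypothetical stand-ins, not JobSpy's real API):

```python
# Hedged sketch of the fail-soft pattern; names and payload shape are stand-ins.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")


def parse_jobs(payload: dict) -> list[dict]:
    # Mimics digging the job cards out of a parsed page.
    return (
        payload.get("metaData", {})
        .get("mosaicProviderJobCardsModel", {})
        .get("results", [])
    )


def scrape_page(payload: dict) -> list[dict]:
    jobs = parse_jobs(payload)
    if not jobs:
        # Previously this situation raised an exception and stopped the scrape;
        # logging and returning an empty list lets other sites still report results.
        logger.error("Example: no jobs found on page")
        return []
    return jobs


print(scrape_page({}))   # -> [] plus an error log line
print(scrape_page({"metaData": {"mosaicProviderJobCardsModel": {"results": [{"jobkey": "abc"}]}}}))
```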
4 changes: 2 additions & 2 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -104,9 +104,9 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
return JobResponse(job_list=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
logger.error(f'LinkedIn: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
logger.error(f'LinkedIn: {str(e)}')
return JobResponse(job_list=job_list)

soup = BeautifulSoup(response.text, "html.parser")