Skip to content

Commit

Permalink
enh(linkedin): company logo
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Apr 30, 2024
1 parent a65408f commit fe40012
Show file tree
Hide file tree
Showing 7 changed files with 1,073 additions and 971 deletions.
2 changes: 1 addition & 1 deletion examples/JobSpy_AllSites.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@
# jobs.to_xlsx('jobs.xlsx', index=False)

# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
# display(jobs)
7 changes: 4 additions & 3 deletions examples/JobSpy_LongScrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,18 @@
search_term="software engineer",
# New York, NY
# Dallas, TX

# Los Angeles, CA
location="Los Angeles, CA",
results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
results_wanted=min(
results_in_each_iteration, results_wanted - len(all_jobs)
),
country_indeed="USA",
offset=offset,
# proxy="http://jobspy:<password>@<proxy-host>:20001",
)

# Add the scraped jobs to the list
all_jobs.extend(jobs.to_dict('records'))
all_jobs.extend(jobs.to_dict("records"))

# Increment the offset for the next page of results
offset += results_in_each_iteration
Expand Down
1,989 changes: 1,041 additions & 948 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "^24.2.0"
pre-commit = "^3.6.2"
black = "*"
pre-commit = "*"

[build-system]
requires = ["poetry-core"]
Expand Down
5 changes: 4 additions & 1 deletion src/jobspy/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod

from ..jobs import (
Enum,
BaseModel,
Expand Down Expand Up @@ -36,9 +38,10 @@ class ScraperInput(BaseModel):
hours_old: int | None = None


class Scraper:
class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
37 changes: 21 additions & 16 deletions src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,16 @@ def _process_job(
if metadata_card
else None
)
date_posted = description = job_type = job_url_direct = None
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except:
date_posted = None
job_details = {}
if full_descr:
description, job_type, job_url_direct = self._get_job_description(job_url)
job_details = self._get_job_details(job_url)

return JobPost(
title=title,
Expand All @@ -214,22 +215,19 @@ def _process_job(
location=location,
date_posted=date_posted,
job_url=job_url,
job_url_direct=job_url_direct,
compensation=compensation,
job_type=job_type,
description=description,
emails=extract_emails_from_text(description) if description else None,
job_type=job_details.get("job_type"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
)

def _get_job_description(
self, job_page_url: str
) -> tuple[None, None, None] | tuple[
str | None, tuple[str | None, JobType | None], str | None
]:
def _get_job_details(self, job_page_url: str) -> dict:
"""
Retrieves job description by going to the job page url
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: description or None
:return: dict
"""
try:
session = create_session(is_tls=False, has_retry=True)
Expand All @@ -238,9 +236,9 @@ def _get_job_description(
)
response.raise_for_status()
except:
return None, None
return {}
if response.url == "https://www.linkedin.com/signup":
return None, None
return {}

soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
Expand All @@ -258,7 +256,14 @@ def remove_attributes(tag):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
return {
"description": description,
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
"data-delayed-url"
),
}

def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Expand Down
Empty file removed test.py
Empty file.

0 comments on commit fe40012

Please sign in to comment.