enh(linkedin): company logo

cullenwatson · Apr 30, 2024 · fe40012 · fe40012
1 parent a65408f
commit fe40012
Show file tree

Hide file tree

Showing 7 changed files with 1,073 additions and 971 deletions.
diff --git a/examples/JobSpy_AllSites.py b/examples/JobSpy_AllSites.py
@@ -27,4 +27,4 @@
 # jobs.to_xlsx('jobs.xlsx', index=False)
 
 # 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
+# display(jobs)
diff --git a/examples/JobSpy_LongScrape.py b/examples/JobSpy_LongScrape.py
@@ -32,17 +32,18 @@
                 search_term="software engineer",
                 # New York, NY
                 # Dallas, TX
-
                 # Los Angeles, CA
                 location="Los Angeles, CA",
-                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                results_wanted=min(
+                    results_in_each_iteration, results_wanted - len(all_jobs)
+                ),
                 country_indeed="USA",
                 offset=offset,
                 # proxy="http://jobspy:[email protected]:20001",
             )
 
             # Add the scraped jobs to the list
-            all_jobs.extend(jobs.to_dict('records'))
+            all_jobs.extend(jobs.to_dict("records"))
 
             # Increment the offset for the next page of results
             offset += results_in_each_iteration

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,8 +25,8 @@ regex = "^2024.4.28"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.1"
 jupyter = "^1.0.0"
-black = "^24.2.0"
-pre-commit = "^3.6.2"
+black = "*"
+pre-commit = "*"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from abc import ABC, abstractmethod
+
 from ..jobs import (
     Enum,
     BaseModel,
@@ -36,9 +38,10 @@ class ScraperInput(BaseModel):
     hours_old: int | None = None
 
 
-class Scraper:
+class Scraper(ABC):
     def __init__(self, site: Site, proxy: list[str] | None = None):
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
 
+    @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
@@ -197,15 +197,16 @@ def _process_job(
             if metadata_card
             else None
         )
-        date_posted = description = job_type = job_url_direct = None
+        date_posted = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:
                 date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
             except:
                 date_posted = None
+        job_details = {}
         if full_descr:
-            description, job_type, job_url_direct = self._get_job_description(job_url)
+            job_details = self._get_job_details(job_url)
 
         return JobPost(
             title=title,
@@ -214,22 +215,19 @@ def _process_job(
             location=location,
             date_posted=date_posted,
             job_url=job_url,
-            job_url_direct=job_url_direct,
             compensation=compensation,
-            job_type=job_type,
-            description=description,
-            emails=extract_emails_from_text(description) if description else None,
+            job_type=job_details.get("job_type"),
+            description=job_details.get("description"),
+            job_url_direct=job_details.get("job_url_direct"),
+            emails=extract_emails_from_text(job_details.get("description")),
+            logo_photo_url=job_details.get("logo_photo_url"),
         )
 
-    def _get_job_description(
-        self, job_page_url: str
-    ) -> tuple[None, None, None] | tuple[
-        str | None, tuple[str | None, JobType | None], str | None
-    ]:
+    def _get_job_details(self, job_page_url: str) -> dict:
         """
-        Retrieves job description by going to the job page url
+        Retrieves job description and other job details by going to the job page url
         :param job_page_url:
-        :return: description or None
+        :return: dict
         """
         try:
             session = create_session(is_tls=False, has_retry=True)
@@ -238,9 +236,9 @@ def _get_job_description(
             )
             response.raise_for_status()
         except:
-            return None, None
+            return {}
         if response.url == "https://www.linkedin.com/signup":
-            return None, None
+            return {}
 
         soup = BeautifulSoup(response.text, "html.parser")
         div_content = soup.find(
@@ -258,7 +256,14 @@ def remove_attributes(tag):
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                 description = markdown_converter(description)
-        return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
+        return {
+            "description": description,
+            "job_type": self._parse_job_type(soup),
+            "job_url_direct": self._parse_job_url_direct(soup),
+            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
+                "data-delayed-url"
+            ),
+        }
 
     def _get_location(self, metadata_card: Optional[Tag]) -> Location:
         """

diff --git a/test.py b/test.py