Commit 916fc8d

python crawler package

1 parent fb1491c commit 916fc8d

File tree

8 files changed
+253 -2 lines changed

.github/workflows/main.yml

+30
@@ -0,0 +1,30 @@
name: Python Crawler

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install setuptools wheel twine
    - name: Test with unittest
      run: |
        python3 -m unittest discover .
    - name: Creating package
      run: |
        python3 setup.py bdist_wheel
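
For reference, the CI steps above can be reproduced locally with the same commands the workflow runs:

```commandline
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install setuptools wheel twine
python3 -m unittest discover .
python3 setup.py bdist_wheel
```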

.gitignore

+2 -1
@@ -159,4 +159,5 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+.DS_Store

README.md

+18 -1
@@ -1 +1,18 @@
# python-crawler
<br/>


## Installation
```commandline
pip install git+ssh://git@github.com/Zepolimer/python-crawler.git@main#python-crawler
```

```commandline
python3 -m unittest
```

```commandline
rm build/ python_crawler.egg-info dist -Rf
python3 setup.py bdist_wheel
pip3 install -I dist/python_crawler-*-py3-none-any.whl
```
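
The README covers installation only; here is a rough usage sketch. `GoogleRequest` (defined in python_crawler/crawler/__init__.py below) expects a query object exposing an `encoded_str` attribute — the `Query` helper in this sketch is hypothetical and not part of this commit:

```python
from urllib.parse import urlencode

from python_crawler.crawler import GoogleRequest
from python_crawler.models import Location


class Query:
    # Hypothetical helper: any object with an `encoded_str` attribute works here
    def __init__(self, **params):
        self.encoded_str = urlencode(params)


# Crawls https://google.fr/search?q=playwright+stealth and returns a Response
response = GoogleRequest(Query(q='playwright stealth'), tld=Location.FRANCE).get()
print(response.status_code, response.error)
```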

python_crawler/__init__.py

Whitespace-only changes.

python_crawler/crawler/__init__.py

+145
@@ -0,0 +1,145 @@
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync

from python_crawler.models import BrowserType, Location


class Response:
    def __init__(self):
        self.status_code = 500
        self.headers = {}
        self.html = None
        self.error = None


class Request:
    def __init__(self, crawler, url):
        self.crawler = crawler
        self.url = url
        self.target_url = None
        self.status_code = None
        self.headers = {}

        self.__title = None
        self.__html = None

    @property
    def title(self):
        if not self.__title:
            self.__title = self.crawler.page.title()
        return self.__title

    @property
    def html(self):
        if not self.__html:
            self.__html = self.crawler.page.content()
        return self.__html

    def get(self):
        """
        Run the request through the crawler and copy status_code, headers and html onto a Response; any exception lands in Response.error
        :return: Response
        """
        response = Response()

        try:
            self.crawler.new_page(self.url)

            r = self.crawler.request
            response.status_code = r.status_code
            response.headers = r.headers
            response.html = r.html
        except Exception as e:
            response.error = str(e)

        self.crawler.close()
        return response


class GoogleRequest(Request):
    def __init__(self, query, tld=Location.WORLDWIDE):
        self.domain = 'google%s' % tld.value

        super().__init__(
            crawler=Crawler(
                domain='.%s' % self.domain,
                browser_type=BrowserType.CHROMIUM
            ),
            url='https://%s/search?%s' % (
                self.domain,
                query.encoded_str
            )
        )


class BingRequest(Request):
    def __init__(self, query, tld=Location.WORLDWIDE):
        self.domain = 'bing%s' % tld.value

        super().__init__(
            crawler=Crawler(
                domain='.%s' % self.domain,
                browser_type=BrowserType.FIREFOX
            ),
            url='https://%s/search?%s' % (
                self.domain,
                query.encoded_str
            )
        )


class CrawlerRequest:
    def __init__(self, url):
        self.url = url
        self.target_url = None
        self.status_code = None
        self.headers = {}


class Crawler:
    def __init__(self, domain='.google.com', browser_type=BrowserType.CHROMIUM):
        self.domain = domain
        self.playwright = sync_playwright().start()

        if browser_type == BrowserType.FIREFOX:
            self.browser = self.playwright.firefox.launch(
                headless=True,
            )
        else:
            self.browser = self.playwright.chromium.launch(
                headless=True,
                args=['--single-process', '--no-zygote', '--no-sandbox']
            )

        self.context = self.browser.new_context()
        self.context.add_cookies([  # pre-set Google's SOCS consent cookie so result pages skip the consent interstitial
            {
                'name': 'SOCS',
                'value': 'CAISHAgBEhJnd3NfMjAyMjA4MjktMF9SQzEaAmVuIAEaBgiB8U-YAg',
                'domain': self.domain,
                'path': '/'
            }
        ])

        self.page = None
        self.request = None

    def new_page(self, url):
        if self.page:
            self.page.close()

        self.request = Request(self, url)
        self.page = self.context.new_page()
        stealth_sync(self.page)
        self.page.on("response", self.__handle_request__)
        self.page.goto(self.request.url)

    def __handle_request__(self, response):
        if not self.request.target_url and response.status not in [302, 301]:  # keep the first non-redirect response
            self.request.target_url = response.url
            self.request.status_code = response.status
            self.request.headers = response.headers

    def close(self):
        self.browser.close()
        self.playwright.stop()
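
A minimal sketch of the flow above: `Request.get()` drives the crawler's `new_page`, copies the captured status, headers and html onto a `Response`, then closes the crawler, so each `Request` needs a fresh `Crawler`:

```python
from python_crawler.crawler import Crawler, Request
from python_crawler.models import BrowserType

crawler = Crawler(domain='.google.com', browser_type=BrowserType.CHROMIUM)
response = Request(crawler, 'https://www.google.com').get()  # also closes the crawler

if response.error is None:
    print(response.status_code, response.headers.get('content-type'))
else:
    print('request failed:', response.error)
```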

python_crawler/models/__init__.py

+25
@@ -0,0 +1,25 @@
from enum import Enum


class BrowserType(Enum):
    CHROMIUM = 1, 'Chromium'
    FIREFOX = 2, 'Firefox'


class Location(Enum):
    """
    Feel free to add your country's TLD if it is missing from the list below
    """

    WORLDWIDE = '.com'
    AUSTRALIA = '.au'
    BELGIUM = '.be'
    CANADA = '.ca'
    DENMARK = '.dk'
    FRANCE = '.fr'
    GERMANY = '.de'
    IRELAND = '.ie'
    ITALIA = '.it'
    NEW_ZEALAND = '.nz'
    SPAIN = '.es'
    SWITZERLAND = '.ch'
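
These TLD values are what `GoogleRequest` and `BingRequest` interpolate into their search domains, e.g.:

```python
from python_crawler.models import Location

print('google%s' % Location.FRANCE.value)    # google.fr
print('bing%s' % Location.WORLDWIDE.value)   # bing.com
```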

requirements.txt

+3
@@ -0,0 +1,3 @@
requests
playwright
playwright_stealth

setup.py

+30
@@ -0,0 +1,30 @@
import setuptools

from pathlib import Path
from pkg_resources import parse_requirements

with open("README.md", "r") as fh:
    long_description = fh.read()

path = Path("requirements.txt")
install_requires = [str(ir) for ir in parse_requirements(path.open())]

setuptools.setup(
    name="python_crawler",
    version='0.0.1',
    author="Rémi Lopez",
    author_email="[email protected]",
    description="Python open-source package : crawler using playwright",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Zepolimer/python-crawler",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.10',
    include_package_data=True,
    install_requires=install_requires,
)
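
`pkg_resources.parse_requirements` works here but is deprecated in recent setuptools releases; given the plain one-requirement-per-line requirements.txt above, a minimal stdlib-only alternative would be:

```python
from pathlib import Path

# Read requirements.txt directly, skipping blanks and comment lines
install_requires = [
    line.strip()
    for line in Path("requirements.txt").read_text().splitlines()
    if line.strip() and not line.strip().startswith("#")
]
```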
