Commit 916fc8d

python crawler package

1 parent fb1491c commit 916fc8d

File tree

8 files changed
+253 -2 lines changed

.github/workflows/main.yml

+30
@@ -0,0 +1,30 @@
name: Python Crawler

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install setuptools wheel twine
    - name: Test with unittest
      run: |
        python3 -m unittest discover .
    - name: Creating package
      run: |
        python3 setup.py bdist_wheel
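
For reference, the CI steps above can be reproduced locally with the same commands the workflow runs:

```commandline
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install setuptools wheel twine
python3 -m unittest discover .
python3 setup.py bdist_wheel
```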

.gitignore

+2 -1
@@ -159,4 +159,5 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+.DS_Store

README.md

+18 -1
@@ -1 +1,18 @@
# python-crawler
<br/>


## Installation
```commandline
pip install git+ssh://git@github.com/Zepolimer/python-crawler.git@main#python-crawler
```

```commandline
python3 -m unittest
```

```commandline
rm build/ python_crawler.egg-info dist -Rf
python3 setup.py bdist_wheel
pip3 install -I dist/python_crawler-*-py3-none-any.whl
```
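
The README covers installation only; here is a rough usage sketch. `GoogleRequest` (defined in python_crawler/crawler/__init__.py below) expects a query object exposing an `encoded_str` attribute — the `Query` helper in this sketch is hypothetical and not part of this commit:

```python
from urllib.parse import urlencode

from python_crawler.crawler import GoogleRequest
from python_crawler.models import Location


class Query:
    # Hypothetical helper: any object with an `encoded_str` attribute works here
    def __init__(self, **params):
        self.encoded_str = urlencode(params)


# Crawls https://google.fr/search?q=playwright+stealth and returns a Response
response = GoogleRequest(Query(q='playwright stealth'), tld=Location.FRANCE).get()
print(response.status_code, response.error)
```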

python_crawler/__init__.py

Whitespace-only changes.

python_crawler/crawler/__init__.py

+145
@@ -0,0 +1,145 @@
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync

from python_crawler.models import BrowserType, Location


class Response:
    def __init__(self):
        self.status_code = 500
        self.headers = {}
        self.html = None
        self.error = None


class Request:
    def __init__(self, crawler, url):
        self.crawler = crawler
        self.url = url
        self.target_url = None
        self.status_code = None
        self.headers = {}

        self.__title = None
        self.__html = None

    @property
    def title(self):
        if not self.__title:
            self.__title = self.crawler.page.title()
        return self.__title

    @property
    def html(self):
        if not self.__html:
            self.__html = self.crawler.page.content()
        return self.__html

    def get(self):
        """
        Run the request through the crawler and copy status_code, headers and html onto a Response; any exception lands in Response.error
        :return: Response
        """
        response = Response()

        try:
            self.crawler.new_page(self.url)

            r = self.crawler.request
            response.status_code = r.status_code
            response.headers = r.headers
            response.html = r.html
        except Exception as e:
            response.error = str(e)

        self.crawler.close()
        return response


class GoogleRequest(Request):
    def __init__(self, query, tld=Location.WORLDWIDE):
        self.domain = 'google%s' % tld.value

        super().__init__(
            crawler=Crawler(
                domain='.%s' % self.domain,
                browser_type=BrowserType.CHROMIUM
            ),
            url='https://%s/search?%s' % (
                self.domain,
                query.encoded_str
            )
        )


class BingRequest(Request):
    def __init__(self, query, tld=Location.WORLDWIDE):
        self.domain = 'bing%s' % tld.value

        super().__init__(
            crawler=Crawler(
                domain='.%s' % self.domain,
                browser_type=BrowserType.FIREFOX
            ),
            url='https://%s/search?%s' % (
                self.domain,
                query.encoded_str
            )
        )


class CrawlerRequest:
    def __init__(self, url):
        self.url = url
        self.target_url = None
        self.status_code = None
        self.headers = {}


class Crawler:
    def __init__(self, domain='.google.com', browser_type=BrowserType.CHROMIUM):
        self.domain = domain
        self.playwright = sync_playwright().start()

        if browser_type == BrowserType.FIREFOX:
            self.browser = self.playwright.firefox.launch(
                headless=True,
            )
        else:
            self.browser = self.playwright.chromium.launch(
                headless=True,
                args=['--single-process', '--no-zygote', '--no-sandbox']
            )

        self.context = self.browser.new_context()
        self.context.add_cookies([  # pre-set Google's SOCS consent cookie so result pages skip the consent interstitial
            {
                'name': 'SOCS',
                'value': 'CAISHAgBEhJnd3NfMjAyMjA4MjktMF9SQzEaAmVuIAEaBgiB8U-YAg',
                'domain': self.domain,
                'path': '/'
            }
        ])

        self.page = None
        self.request = None

    def new_page(self, url):
        if self.page:
            self.page.close()

        self.request = Request(self, url)
        self.page = self.context.new_page()
        stealth_sync(self.page)
        self.page.on("response", self.__handle_request__)
        self.page.goto(self.request.url)

    def __handle_request__(self, response):
        if not self.request.target_url and response.status not in [302, 301]:  # keep the first non-redirect response
            self.request.target_url = response.url
            self.request.status_code = response.status
            self.request.headers = response.headers

    def close(self):
        self.browser.close()
        self.playwright.stop()
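
A minimal sketch of the flow above: `Request.get()` drives the crawler's `new_page`, copies the captured status, headers and html onto a `Response`, then closes the crawler, so each `Request` needs a fresh `Crawler`:

```python
from python_crawler.crawler import Crawler, Request
from python_crawler.models import BrowserType

crawler = Crawler(domain='.google.com', browser_type=BrowserType.CHROMIUM)
response = Request(crawler, 'https://www.google.com').get()  # also closes the crawler

if response.error is None:
    print(response.status_code, response.headers.get('content-type'))
else:
    print('request failed:', response.error)
```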

python_crawler/models/__init__.py

+25
@@ -0,0 +1,25 @@
from enum import Enum


class BrowserType(Enum):
    CHROMIUM = 1, 'Chromium'
    FIREFOX = 2, 'Firefox'


class Location(Enum):
    """
    Feel free to add your country's TLD if it is missing from the list below
    """

    WORLDWIDE = '.com'
    AUSTRALIA = '.au'
    BELGIUM = '.be'
    CANADA = '.ca'
    DENMARK = '.dk'
    FRANCE = '.fr'
    GERMANY = '.de'
    IRELAND = '.ie'
    ITALIA = '.it'
    NEW_ZEALAND = '.nz'
    SPAIN = '.es'
    SWITZERLAND = '.ch'
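
These TLD values are what `GoogleRequest` and `BingRequest` interpolate into their search domains, e.g.:

```python
from python_crawler.models import Location

print('google%s' % Location.FRANCE.value)    # google.fr
print('bing%s' % Location.WORLDWIDE.value)   # bing.com
```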

requirements.txt

+3
@@ -0,0 +1,3 @@
requests
playwright
playwright_stealth

setup.py

+30
@@ -0,0 +1,30 @@
import setuptools

from pathlib import Path
from pkg_resources import parse_requirements

with open("README.md", "r") as fh:
    long_description = fh.read()

path = Path("requirements.txt")
install_requires = [str(ir) for ir in parse_requirements(path.open())]

setuptools.setup(
    name="python_crawler",
    version='0.0.1',
    author="Rémi Lopez",
    author_email="[email protected]",
    description="Python open-source package : crawler using playwright",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Zepolimer/python-crawler",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.10',
    include_package_data=True,
    install_requires=install_requires,
)
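
`pkg_resources.parse_requirements` works here but is deprecated in recent setuptools releases; given the plain one-requirement-per-line requirements.txt above, a minimal stdlib-only alternative would be:

```python
from pathlib import Path

# Read requirements.txt directly, skipping blanks and comment lines
install_requires = [
    line.strip()
    for line in Path("requirements.txt").read_text().splitlines()
    if line.strip() and not line.strip().startswith("#")
]
```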
