-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbukalapak.py
108 lines (91 loc) · 3.35 KB
/
bukalapak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
from urllib.parse import (
urlparse,
quote,
urlencode,
parse_qsl,
)
from logging import getLogger
from parser import Parser as BaseParser
# Percent-encoded name of the query-string parameter carrying the keywords.
QSTRING_KEY = quote('search[keywords]')

# Text content of <h1> elements.
XPATH_TITLE = '//h1/text()'

# Flex items nested inside the flex-wrap grid container.
XPATH_PRODUCT = (
    '//div[contains(@class,"bl-flex-container flex-wrap is-gutter-16")]'
    '/div[contains(@class,"bl-flex-item")]'
)

# Price text inside the "-discounted" and "-original" price containers.
XPATH_PRICE_DISC = (
    '//div[contains(@class,"c-product-price -discounted")]'
    '/span/text()'
)
XPATH_PRICE_ORIG = (
    '//div[contains(@class,"c-product-price -original")]'
    '/span/text()'
)

# Content of the Open Graph image <meta> tag.
XPATH_IMAGE = '//meta[@property="og:image"]/@content'

# Text of embedded JSON-LD <script> blocks.
XPATH_JSON = '//script[@type="application/ld+json"]/text()'

# href of the avatar anchor and text of the seller-city anchor.
XPATH_SHOP_URL = '//a[@class="c-avatar"]/@href'
XPATH_CITY = '//a[contains(@class,"c-seller__city")]/text()'

# Element whose presence is treated as "out of stock".
XPATH_EMPTY = '//div[@class="c-main-product__unavailable"]'
class Parser(BaseParser):
    """Parser for bukalapak.com search-result and product pages.

    Extends the project's base ``Parser``. On construction it scans the
    page's JSON-LD ``<script>`` blocks and keeps the first object whose
    ``@type`` is ``"Product"`` in ``self.meta`` (an empty dict when none
    is found).
    """

    # NOTE(review): not referenced in this class; presumably consumed by the
    # base class (e.g. by its get_info()) -- confirm against BaseParser.
    XPATH_INFO = '//table[@class="c-information__table"]/tbody/tr'
    XPATH_DESC = '//div[@class="c-information__description-txt"]'

    def __init__(self, *args, **kwargs):
        """Initialise the base parser, then load product JSON-LD metadata.

        All arguments are forwarded unchanged to the base class, which is
        assumed to set ``self.response`` -- TODO confirm.
        """
        super().__init__(*args, **kwargs)
        self.meta = dict()
        for xs in self.response.xpath(XPATH_JSON):
            try:
                d = json.loads(xs.extract())
            except ValueError:
                # A malformed JSON-LD block must not make the whole page
                # unparsable; the other getters do not depend on it.
                continue
            # JSON-LD may also be a list or an object without "@type".
            if isinstance(d, dict) and d.get('@type') == 'Product':
                self.meta = d
                break

    @classmethod
    def start_url(cls, keywords):
        """Return the search URL for *keywords* (percent-encoded)."""
        value = quote(keywords)
        return f'https://www.bukalapak.com/products?{QSTRING_KEY}={value}'

    @classmethod
    def get_product_urls(cls, response) -> list:  # Override
        """Return the product links found on a search-result page.

        For every node matched by ``XPATH_PRODUCT`` the link is read from
        the second ``observer-tracker/div/div/div/div`` descendant. Cards
        that lack that node or carry no ``href`` are skipped, so one
        incomplete card does not drop the cards that follow it.
        """
        r = []
        for xs in response.xpath(XPATH_PRODUCT):
            cells = xs.xpath('observer-tracker/div/div/div/div')
            if len(cells) < 2:
                # The link lives in the second cell; nothing to read.
                continue
            hrefs = cells[1].xpath('p/a/@href').extract()
            if not hrefs:
                continue
            r.append(hrefs[0])
        return r

    @classmethod
    def next_page_url(cls, response) -> str:  # Override
        """Return ``response.url`` with its ``page`` parameter incremented.

        A URL without a ``page`` parameter is treated as page 1, so the
        result points at page 2. Other query parameters are preserved.

        Raises:
            ValueError: if the existing ``page`` value is not an integer.
        """
        p = urlparse(response.url)
        d = dict(parse_qsl(p.query))
        d['page'] = int(d.get('page', 1)) + 1
        return f'{p.scheme}://{p.netloc}{p.path}?' + urlencode(d)

    def is_product_list(self):
        """Return True when the body contains "hasil pencarian".

        The match is case-insensitive and done on the raw response bytes.
        """
        return b'hasil pencarian' in self.response.body.lower()

    def get_info(self) -> dict:
        """Return the base class's ``get_info`` result for argument ``3``.

        NOTE(review): the meaning of ``3`` is defined by the base class and
        is not visible here -- confirm.
        """
        return super().get_info(3)

    def get_title(self) -> str:  # Override
        """Return the text of the first ``<h1>``; IndexError if absent."""
        return self.response.xpath(XPATH_TITLE).extract()[0]

    def get_price(self) -> float:  # Override
        """Return the price as a float.

        The discounted price is preferred; the original price is used when
        no discounted price node exists. The ``Rp`` prefix and the ``.``
        thousands separators are removed before conversion.

        Raises:
            IndexError: if neither price node is present.
            ValueError: if the remaining text is not numeric.
        """
        xs = self.response.xpath(XPATH_PRICE_DISC)
        if not xs:
            xs = self.response.xpath(XPATH_PRICE_ORIG)
        v = xs.extract()[0]
        # Strip surrounding whitespace first; otherwise a leading blank
        # keeps lstrip('Rp') from removing the currency prefix.
        v = v.strip().lstrip('Rp').replace('.', '')
        return float(v)

    def get_image(self) -> str:  # Override
        """Return the ``og:image`` URL; IndexError if the tag is absent."""
        return self.response.xpath(XPATH_IMAGE).extract()[0]

    def get_shop_name(self) -> str:
        """Return the seller name from the JSON-LD product metadata.

        Raises:
            KeyError: if no product metadata was found or it lacks
                ``offers.seller.name``.
        """
        return self.meta['offers']['seller']['name']

    def get_shop_url(self) -> str:
        """Return the href of the avatar link; IndexError if absent."""
        return self.response.xpath(XPATH_SHOP_URL).extract()[0]

    def get_city(self) -> str:
        """Return the seller city text; IndexError if absent."""
        return self.response.xpath(XPATH_CITY).extract()[0]

    def get_stock(self) -> int:
        """Return 0 when the "unavailable" marker is present, else 1."""
        if self.response.xpath(XPATH_EMPTY):
            return 0
        return 1