-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBeautifulSoup-scrape
77 lines (67 loc) · 2.44 KB
/
BeautifulSoup-scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Column order of the DataFrame returned by car_scraping().
_RESULT_COLUMNS = [
    "Make", "Model", "Mileage", "Price", "Price_drop",
    "Dealer", "Rating", "Used", "Year",
]

# cars.com search-results URL; only the page number varies between requests.
_SEARCH_URL = (
    "https://www.cars.com/shopping/results/?page={page}&page_size=20"
    "&dealer_id=&list_price_max=&list_price_min=&makes[]="
    "&maximum_distance=20&mileage_max=&sort=best_match_desc"
    "&stock_type=all&year_max=&year_min=&zip="
)


def _first_text(node, tag, css_class=None):
    """Return the text of the first *tag* under *node*, or None if absent."""
    attrs = {"class": css_class} if css_class else {}
    match = node.find(tag, attrs)
    return match.get_text() if match is not None else None


def _parse_number(text):
    """Convert scraped text like "$25,999" or "45,123 mi." to a float.

    Commas are treated as thousands separators and removed. Returns None
    when *text* is None or does not contain a parseable number (for
    example a listing without a numeric price).
    """
    if text is None:
        return None
    cleaned = text
    for noise in ("price drop", "mi.", "$", ","):
        cleaned = cleaned.replace(noise, "")
    try:
        return float(cleaned.strip())
    except ValueError:
        return None


def _parse_listing(card):
    """Extract one row (in _RESULT_COLUMNS order) from a vehicle card."""
    # The title is split on whitespace: first token is the year, the next
    # two are taken as make and model (multi-word names are truncated).
    title = (_first_text(card, "h2", "title") or "").split()
    year = title[0] if title else None
    make = title[1] if len(title) > 1 else None
    model = title[2] if len(title) > 2 else None

    # Every card contributes exactly one price-drop value so that this
    # column stays aligned with the others; 0 means "no price drop shown".
    drop = _parse_number(_first_text(card, "span", "price-drop"))

    return (
        make,
        model,
        _parse_number(_first_text(card, "div", "mileage")),
        _parse_number(_first_text(card, "span", "primary-price")),
        int(drop) if drop is not None else 0,
        _first_text(card, "strong"),
        _first_text(card, "span", "sds-rating__count"),
        _first_text(card, "p", "stock-type"),
        year,
    )


def car_scraping(pages: int) -> pd.DataFrame:
    """Scrape cars.com search results into a DataFrame.

    Requests ``pages`` result pages and parses every
    ``div.vehicle-card-main`` element into one row.

    Args:
        pages: Number of result pages to request. ``0`` performs no
            requests and yields an empty frame.

    Returns:
        A DataFrame with the columns Make, Model, Mileage, Price,
        Price_drop, Dealer, Rating, Used and Year, one row per listing.
        Mileage and Price are floats (missing/unparseable values are
        NaN), Price_drop is an int (0 when the card shows no drop), and
        the remaining columns hold the scraped text or None.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.

    Side effects:
        Rebinds the module-level names ``Price`` (list of parsed prices,
        None where unparseable) and ``df`` (the returned frame). These are
        kept only for backward compatibility with existing callers.
    """
    global Price, df

    rows = []
    # NOTE(review): pages are requested as 1..pages, assuming the site's
    # pagination is 1-based (the original started at page=0) - confirm.
    for page in range(1, pages + 1):
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(_SEARCH_URL.format(page=page)) as response:
            soup = BeautifulSoup(response, "html.parser")
        for card in soup.find_all("div", {"class": "vehicle-card-main"}):
            rows.append(_parse_listing(card))

    frame = pd.DataFrame(rows, columns=_RESULT_COLUMNS)

    Price = [row[_RESULT_COLUMNS.index("Price")] for row in rows]
    df = frame
    return frame
# Run the scraper over the first 200 result pages and keep the resulting table.
data = car_scraping(pages=200)