diff --git a/main.py b/main.py
index 1976460..638d7a8 100644
--- a/main.py
+++ b/main.py
@@ -25,7 +25,7 @@ class Item(BaseModel):
@app.put("/backend/")
async def returnSearch(item: Item):
- item.data=toriScraper(item.product, item.priceMin, item.priceMax, item.distance, item.city)
+ item.data=toriScraper(item.product, item.priceMin, item.priceMax, item.distance, item.city, 0)
return(item)
diff --git a/scraper.py b/scraper.py
index acd8aab..9f0df43 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,193 +1,86 @@
import requests
from bs4 import BeautifulSoup
-import datetime
-import json
from sanakirja import sanakirja
-def toriScraper(productName, priceMin, priceMax, etäisyys, kaupunki):
- runWait=30
+def toriScraper(productName, priceMin, priceMax, etäisyys, kaupunki, timeSinceLastCheck):
+ # timeSinceLastCheck == 0 means that this is a request made by the frontend
+ # Otherwise timeSinceLastCheck should be given in minutes and be less than 60
- today=datetime.date.today()
- #tämä joskus featureksi siten, että käyttäjä voi itse inputaa kuinka vanhoja tuloksia maksimissaan haluaa
- latest="3 päivää sitten"
- if latest and "päivä" in latest:
- if latest.split(" ")[0] == "päivä":
- daysToAdd=1
- else:
- daysToAdd=float(latest.split(" ")[0])
- latest=today - datetime.timedelta(days=daysToAdd)
-
- productNameFixed=str(productName).lower().split(" ")
- product=""
- plus=len(productNameFixed)
- no_plus=1
- for word in productNameFixed:
- no_plus+=1
- product+=word.strip()
- if plus>=no_plus:
- product+="+"
-
- #Tänä osa on sitä varten, jos joskus halutaan, että ohjelma hakee VAIN läheltä torista. ilman tätä käy koko suomen läpi
- #givenLocation=str(input()).lower()
- #givenLocation=""
-
- #locationsDict={
- # "ahvenanmaa":"0.100015",
- # "etelä-karjala":"0.100014",
- # "etelä-pohjanmaa":"0.100006",
- # "etelä-savo":"0.100013",
- # "kainuu":"0.100003",
- # "keski-pohjanmaa":"0.100004",
- # "keski-suomi":"0.100007",
- # "kymenlaakso":"0.100020",
- # "lappi":"0.100001",
- # "pirkanmaa":"0.100011",
- # "pohjanmaa":"0.100005",
- # "pohjois-karjala":"0.100009",
- # "pohjois-pohjanmaa":"0.100002",
- # "pohjois-savo":"0.100008",
- # "päijät-häme":"0.100012",
- # "satakunta":"0.100010",
- # "uusimaa":"0.100018",
- # "varsinais-suomi":"0.100016"
- #}
- #if givenLocation:
- # location=locationsDict[givenLocation]
- # page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?location={location}&q={product}&sort=PUBLISHED_DESC")
- # print(f"https://www.tori.fi/recommerce/forsale/search?location={location}&q={product}&sort=PUBLISHED_DESC")
- #else:
- # page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?q={product}&sort=PUBLISHED_DESC")
- # print(f"https://www.tori.fi/recommerce/forsale/search?q={product}&sort=PUBLISHED_DESC")
+ if timeSinceLastCheck==0:
+ latest=3
+ # This is a placeholder value
+ else:
+ # Set latest = 1 to only search for results from today
+ latest=1
+
+ # Make product name readable by tori search algorithm
+ product=str(productName).lower().replace(" ", "+")
+ # Check what cities products can be from
allCities=sanakirja[kaupunki]
allowedCities=[]
for city in allCities:
if city[1]<=etäisyys:
allowedCities.append(city[0].lower())
allowedCities.append(kaupunki)
- print(allowedCities)
+
+ # When 0€ is a valid price, extend the link so that give-away posts are included as well
+ if priceMin<=0:
+ freeIncludedLink="&trade_type=2"
+ else:
+ freeIncludedLink=""
foundListings=[]
- page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?q={product}&sort=PUBLISHED_DESC")
- print(f"https://www.tori.fi/recommerce/forsale/search?q={product}&sort=PUBLISHED_DESC")
+ page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?price_from={priceMin}&price_to={priceMax}&published={latest}&q={product}&sort=PUBLISHED_DESC&trade_type=1{freeIncludedLink}")
+ print(f"https://www.tori.fi/recommerce/forsale/search?price_from={priceMin}&price_to={priceMax}&published={latest}&q={product}&sort=PUBLISHED_DESC&trade_type=1{freeIncludedLink}")
if page.status_code==200:
page=page.content
parsedPage=BeautifulSoup(page, 'html.parser')
- announcements=parsedPage.find('div', attrs={'class':'grid grid-cols-2 md:grid-cols-3 grid-flow-row-dense gap-16 items-start sf-result-list mt-16'})
- sales=announcements.find_all('article')
-
- for sale in sales:
- link=str(sale.find('a', attrs={'class':'sf-search-ad-link s-text! hover:no-underline'})).split("href")[1].split(">")[0].replace("href","").replace('"','').split("=")[1].replace(" id","")
- if "paalupaikka" in str(sale.find('span', attrs={'class':'absolute top-0 left-0 pointer-events-none badge--positionTL badge--info'})).lower():
- continue
- name=str(sale.find('a')).split("")[-1].replace("","")
- if "ostetaan" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("","").replace("€","").strip().lower():
- continue
- elif "annetaan" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("","").replace("€","").strip().lower():
- price=0
- elif "myydään" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("","").replace("€","").strip().lower():
- continue
- else:
- price=int(str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("\xa0","").replace("","").replace("","").replace("€","").strip())
- when=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[0].replace("","").replace('','')
- place=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[2].replace("","").split(",")[0].lower()
- if place not in allowedCities:
- continue
- if latest:
- if "päivä" in when:
- if when.split(" ")[0] == "päivä":
- when=today-datetime.timedelta(days=1)
- else:
- daysToAdd=float(when.split(" ")[0])
- when=today-datetime.timedelta(days=daysToAdd)
- if latest>when:
- return foundListings
- else:
- if "minuutti" in when:
- if "minuutti" in when.split(" ")[0]:
- time=1
- else:
- time=int(when.split(" ")[0])
- if time>runWait*2:
- return foundListings
- else:
- return foundListings
- if price<=priceMax and price >= priceMin:
- print(name)
- print(price)
- print(place)
- print(when)
- dataOfProduct=f"nimi: {name}, hinta: {price}, paikka: {place}, milloin: {when}, linkki: {link}"
- foundListings.append(dataOfProduct)
+ # Calculate how many pages there are
pages=parsedPage.find('nav', attrs={'class':'flex items-center justify-center p-8 mt-24'})
pages=pages.find('div',attrs={'class':'hidden md:block s-text-link'})
pages=str(pages).count("Sivu")
if pages:
- for whichPage in range (2,51):
-
- #if givenLocation:
- # location=locationsDict[givenLocation]
- # page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?location={location}&page={whichPage}&q={product}&sort=PUBLISHED_DESC")
- # print(f"https://www.tori.fi/recommerce/forsale/search?location={location}&page={whichPage}&q={product}&sort=PUBLISHED_DESC")
- #else:
- # page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&q={product}&sort=PUBLISHED_DESC")
- # print(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&q={product}&sort=PUBLISHED_DESC")
-
- page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&q={product}&sort=PUBLISHED_DESC")
- print(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&q={product}&sort=PUBLISHED_DESC")
-
- if page.status_code==200:
- page=page.content
- parsedPage=BeautifulSoup(page, 'html.parser')
+ for whichPage in range (1,pages+1):
+ # Page 1 was already fetched above; every later page must be requested explicitly with the page parameter, otherwise page 1 is downloaded again
+ if(whichPage!=1):
+ page=requests.get(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&price_from={priceMin}&price_to={priceMax}&published={latest}&q={product}&sort=PUBLISHED_DESC&trade_type=1{freeIncludedLink}")
+ print(f"https://www.tori.fi/recommerce/forsale/search?page={whichPage}&price_from={priceMin}&price_to={priceMax}&published={latest}&q={product}&sort=PUBLISHED_DESC&trade_type=1{freeIncludedLink}")
+ if page.status_code==200:
+ page=page.content
+ parsedPage=BeautifulSoup(page, 'html.parser')
+
+ announcements=parsedPage.find('div', attrs={'class':'grid grid-cols-2 md:grid-cols-3 grid-flow-row-dense gap-16 items-start sf-result-list mt-16'})
+ sales=announcements.find_all('article')
+ for sale in sales:
+ link=str(sale.find('a', attrs={'class':'sf-search-ad-link s-text! hover:no-underline'})).split("href")[1].split(">")[0].replace("href","").replace('"','').split("=")[1].replace(" id","")
+ # Don't include sales tagged paalupaikka. Later change this to include them only once
+ if "paalupaikka" in str(sale.find('span', attrs={'class':'absolute top-0 left-0 pointer-events-none badge--positionTL badge--info'})).lower():
+ continue
+ name=str(sale.find('a')).split("")[-1].replace("","")
+ # Set price of items that are being given away as free
+ if "annetaan" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("
","").replace("€","").strip().lower():
+ price=0
+ else:
+ price=int(str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("\xa0","").replace("","").replace("","").replace("€","").strip())
+ when=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[0].replace("","").replace('','')
+ place=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[2].replace("","").split(",")[0].lower()
+
+ if not place in allowedCities:
+ continue
- announcements=parsedPage.find('div', attrs={'class':'grid grid-cols-2 md:grid-cols-3 grid-flow-row-dense gap-16 items-start sf-result-list mt-16'})
- sales=announcements.find_all('article')
- for sale in sales:
- link=str(sale.find('a', attrs={'class':'sf-search-ad-link s-text! hover:no-underline'})).split("href")[1].split(">")[0].replace("href","").replace('"','').split("=")[1].replace(" id","")
- if "paalupaikka" in str(sale.find('span', attrs={'class':'absolute top-0 left-0 pointer-events-none badge--positionTL badge--info'})).lower():
- continue
- name=str(sale.find('a')).split("")[-1].replace("","")
- if "ostetaan" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("
","").replace("€","").strip().lower():
- continue
- elif "annetaan" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("","").replace("€","").strip().lower():
- price=0
- elif "myydään" in str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("","").replace("","").replace("€","").strip().lower():
- continue
- else:
- price=int(str(sale.find('div', attrs={'class':'mt-16 flex justify-between sm:mt-8 space-x-12 font-bold whitespace-nowrap'})).split("")[-1].replace("\xa0","").replace("","").replace("","").replace("€","").strip())
- when=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[0].replace("","").replace('','')
- place=str(sale.find('div', attrs={'class':'text-s s-text-subtle mx-16 mt-8'})).split("")[2].replace("","").split(",")[0].lower()
- if not place in allowedCities:
- continue
- if latest:
- if "päivä" in when:
- if when.split(" ")[0] == "päivä":
- when=today-datetime.timedelta(days=1)
- else:
- daysToAdd=float(when.split(" ")[0])
- when=today-datetime.timedelta(days=daysToAdd)
- if latest>when:
- print(foundListings)
- return foundListings
- else:
- if "minuutti" in when:
- if "minuutti" in when.split(" ")[0]:
- time=1
- time=int(when.split(" ")[0])
- if time>runWait*2:
- return foundListings
+ if timeSinceLastCheck!=0:
+ #check if post has been made since the last check
+ if "minuutti" in when:
+ if "minuutti" in when.split(" ")[0]:
+ time=1
else:
+ time=int(when.split(" ")[0])
+ if time>timeSinceLastCheck:
return foundListings
- if price<=priceMax and price >= priceMin:
- print(name)
- print(price)
- print(place)
- print(when)
- dataOfProduct=f"nimi: {name}, hinta: {price}, paikka: {place}, milloin: {when}, linkki: {link}"
- foundListings.append([name, price, place, when, link])
- pages=parsedPage.find('nav', attrs={'class':'flex items-center justify-center p-8 mt-24'})
- pages=pages.find('div',attrs={'class':'hidden md:block s-text-link'})
- if not f"Sivu {whichPage+1}" in str(pages):
- print(foundListings)
- return foundListings
+ else:
+ return foundListings
+ foundListings.append([name, price, place, when, link])
+ return foundListings
+ return [["no pages found"]]