diff --git a/crawler/crawl.py b/crawler/crawl.py index e324cd6..fc80460 100644 --- a/crawler/crawl.py +++ b/crawler/crawl.py @@ -1,36 +1,34 @@ -from dataclasses import dataclass -import os import datetime -from typing import List import logging +import os +from dataclasses import dataclass from pathlib import Path from time import time +from typing import List - +from crawler.store.boso import BosoCrawler +from crawler.store.brodokomerc import BrodokomercCrawler +from crawler.store.dm import DmCrawler +from crawler.store.eurospin import EurospinCrawler +from crawler.store.jadranka_trgovina import JadrankaTrgovinaCrawler +from crawler.store.kaufland import KauflandCrawler from crawler.store.konzum import KonzumCrawler +from crawler.store.ktc import KtcCrawler from crawler.store.lidl import LidlCrawler +from crawler.store.lorenco import LorencoCrawler +from crawler.store.metro import MetroCrawler +from crawler.store.ntl import NtlCrawler +from crawler.store.output import copy_archive_info, create_archive, save_chain from crawler.store.plodine import PlodineCrawler from crawler.store.ribola import RibolaCrawler from crawler.store.roto import RotoCrawler from crawler.store.spar import SparCrawler from crawler.store.studenac import StudenacCrawler from crawler.store.tommy import TommyCrawler -from crawler.store.kaufland import KauflandCrawler -from crawler.store.eurospin import EurospinCrawler -from crawler.store.dm import DmCrawler -from crawler.store.ktc import KtcCrawler -from crawler.store.metro import MetroCrawler from crawler.store.trgocentar import TrgocentarCrawler -from crawler.store.zabac import ZabacCrawler -from crawler.store.vrutak import VrutakCrawler -from crawler.store.ntl import NtlCrawler from crawler.store.trgovina_krk import TrgovinaKrkCrawler -from crawler.store.brodokomerc import BrodokomercCrawler -from crawler.store.lorenco import LorencoCrawler -from crawler.store.boso import BosoCrawler - - -from crawler.store.output import save_chain, 
copy_archive_info, create_archive +from crawler.store.vrutak import VrutakCrawler +from crawler.store.zabac import ZabacCrawler logger = logging.getLogger(__name__) @@ -56,6 +54,7 @@ BrodokomercCrawler.CHAIN: BrodokomercCrawler, LorencoCrawler.CHAIN: LorencoCrawler, BosoCrawler.CHAIN: BosoCrawler, + JadrankaTrgovinaCrawler.CHAIN: JadrankaTrgovinaCrawler, } diff --git a/crawler/store/base.py b/crawler/store/base.py index 9eb8911..3c05115 100644 --- a/crawler/store/base.py +++ b/crawler/store/base.py @@ -1,16 +1,16 @@ +import datetime +import unicodedata from csv import DictReader -from decimal import Decimal, InvalidOperation, ROUND_HALF_UP +from decimal import ROUND_HALF_UP, Decimal, InvalidOperation from logging import getLogger +from re import Pattern from tempfile import NamedTemporaryFile -from typing import Any, BinaryIO, Generator from time import time +from typing import Any, BinaryIO, Generator from zipfile import ZipFile -import datetime -from bs4 import BeautifulSoup -from re import Pattern -import unicodedata import httpx +from bs4 import BeautifulSoup from .models import Product, Store diff --git a/crawler/store/boso.py b/crawler/store/boso.py index 3595684..b5dc6c2 100644 --- a/crawler/store/boso.py +++ b/crawler/store/boso.py @@ -8,6 +8,7 @@ from bs4 import BeautifulSoup from crawler.store.models import Store + from .base import BaseCrawler logger = logging.getLogger(__name__) diff --git a/crawler/store/brodokomerc.py b/crawler/store/brodokomerc.py index 566faa1..d04fac9 100644 --- a/crawler/store/brodokomerc.py +++ b/crawler/store/brodokomerc.py @@ -1,14 +1,15 @@ import datetime import logging import re -from typing import List, Dict, Any, Optional +from typing import Any, Dict, List, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup -from .base import BaseCrawler from crawler.store.models import Store +from .base import BaseCrawler + logger = logging.getLogger(__name__) diff --git a/crawler/store/dm.py b/crawler/store/dm.py 
index 480d781..50bcfc4 100644
--- a/crawler/store/dm.py
+++ b/crawler/store/dm.py
@@ -7,6 +7,7 @@
 from typing import Any, List
 
 import openpyxl
+
 from crawler.store.models import Product, Store
 
 from .base import BaseCrawler
diff --git a/crawler/store/eurospin.py b/crawler/store/eurospin.py
index e54f912..d38ca8c 100644
--- a/crawler/store/eurospin.py
+++ b/crawler/store/eurospin.py
@@ -4,6 +4,7 @@
 from typing import List
 
 from bs4 import BeautifulSoup
+
 from crawler.store.models import Product, Store
 
 from .base import BaseCrawler
diff --git a/crawler/store/jadranka_trgovina.py b/crawler/store/jadranka_trgovina.py
new file mode 100644
index 0000000..b53ed7b
--- /dev/null
+++ b/crawler/store/jadranka_trgovina.py
@@ -0,0 +1,198 @@
+import datetime
+import logging
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from crawler.store.models import Product, Store
+
+from .base import BaseCrawler
+
+logger = logging.getLogger(__name__)
+
+
+class JadrankaTrgovinaCrawler(BaseCrawler):
+    """
+    Crawler for Jadranka Trgovina store prices.
+
+    Jadranka Trgovina publishes daily CSV price lists for a single store location
+    (Market Maxi Dražica 5, Mali Lošinj). Files follow the pattern:
+        MARKET_MAXI_DRAZICA5_MALILOSINJ_607_DDMMYYYY_0800.csv
+    """
+
+    CHAIN = "jadranka_trgovina"
+    BASE_URL = "https://jadranka-trgovina.com"
+    INDEX_URL = "https://jadranka-trgovina.com/cjenici/"
+
+    # Regex to match CSV filenames and extract date
+    # Format: MARKET_MAXI_DRAZICA5_MALILOSINJ_607_DDMMYYYY_0800.csv
+    CSV_FILENAME_PATTERN = re.compile(
+        r"MARKET_MAXI_DRAZICA5_MALILOSINJ_607_(\d{2})(\d{2})(\d{4})_0800\.csv"
+    )
+
+    # Mapping for price fields from CSV columns
+    # CSV columns in Croatian:
+    # NAZIV PROIZVODA, ŠIFRA PROIZVODA, MARKA PROIZVODA, NETO KOLIČINA,
+    # JEDINICA MJERE, MALOPRODAJNA CIJENA, CIJENA ZA JEDINICU MJERE,
+    # MPC ZA VRIJEME POSEBNOG OBLIKA PRODAJE, NAJNIŽA CIJENA U POSLJEDNIH 30 DANA,
+    # SIDRENA CIJENA NA 2.5.2025, BARKOD, KATEGORIJA PROIZVODA
+    PRICE_MAP = {
+        # field: (column_name, is_required)
+        # Note: Many products have empty retail price but filled special price
+        # Some products also have empty unit_price
+        "price": ("MALOPRODAJNA CIJENA", False),
+        "unit_price": ("CIJENA ZA JEDINICU MJERE", False),
+        "special_price": ("MPC ZA VRIJEME POSEBNOG OBLIKA PRODAJE", False),
+        "best_price_30": ("NAJNIŽA CIJENA U POSLJEDNIH 30 DANA", False),
+        "anchor_price": ("SIDRENA CIJENA NA 2.5.2025", False),
+    }
+
+    # Mapping for other product fields from CSV columns
+    FIELD_MAP = {
+        "product_id": ("ŠIFRA PROIZVODA", True),
+        "product": ("NAZIV PROIZVODA", True),
+        "brand": ("MARKA PROIZVODA", False),
+        "barcode": ("BARKOD", False),
+        "category": ("KATEGORIJA PROIZVODA", False),
+        "quantity": ("NETO KOLIČINA", False),
+        "unit": ("JEDINICA MJERE", False),
+    }
+
+    def parse_index(self, content: str) -> list[str]:
+        """
+        Parse the Jadranka Trgovina index page to extract CSV links.
+
+        Args:
+            content: HTML content of the index page
+
+        Returns:
+            List of absolute CSV URLs found on the page; relative links are
+            resolved against INDEX_URL
+        """
+        soup = BeautifulSoup(content, "html.parser")
+        urls = []
+
+        # Find all links ending with .csv
+        for link_tag in soup.select('a[href$=".csv"]'):
+            href = str(link_tag.get("href"))
+            # Resolve the link the way a browser would: relative to the index
+            # page. Already-absolute URLs pass through urljoin unchanged.
+            urls.append(urljoin(self.INDEX_URL, href))
+
+        return urls
+
+    def get_index(self, date: datetime.date) -> str | None:
+        """
+        Fetch the index page and find the CSV URL for the specified date.
+
+        Args:
+            date: The date for which to fetch the price list
+
+        Returns:
+            CSV URL for the specified date, or None if not found
+        """
+        content = self.fetch_text(self.INDEX_URL)
+        if not content:
+            logger.warning(
+                f"No content found at Jadranka Trgovina index URL: {self.INDEX_URL}"
+            )
+            return None
+
+        urls = self.parse_index(content)
+
+        # Format date as DDMMYYYY to match filename pattern
+        date_str = f"{date.day:02d}{date.month:02d}{date.year}"
+
+        # Find URL matching the requested date
+        for url in urls:
+            if date_str in url:
+                logger.info(f"Found Jadranka Trgovina CSV for {date}: {url}")
+                return url
+
+        logger.warning(f"No Jadranka Trgovina CSV found for date {date}")
+        return None
+
+    def parse_store_info(self) -> Store:
+        """
+        Create store information for the single Jadranka Trgovina location.
+
+        Jadranka Trgovina only has one location that publishes prices:
+        Market Maxi Dražica 5, Mali Lošinj (Store ID: 607)
+
+        Returns:
+            Store object with the fixed store information
+        """
+        return Store(
+            chain=self.CHAIN,
+            store_id="607",
+            name="Jadranka Trgovina Market Maxi",
+            store_type="market",
+            city="Mali Lošinj",
+            street_address="Dražica 5",
+            zipcode="",
+            items=[],
+        )
+
+    def get_store_prices(self, csv_url: str) -> list[Product]:
+        """
+        Fetch and parse store prices from a CSV URL.
+
+        Args:
+            csv_url: URL to the CSV file containing prices
+
+        Returns:
+            List of Product objects parsed from the CSV
+        """
+        try:
+            content = self.fetch_text(csv_url, encodings=["windows-1250", "utf-8"])
+            return self.parse_csv(content, delimiter=";")
+        except Exception as e:
+            logger.error(
+                f"Failed to get Jadranka Trgovina prices from {csv_url}: {e}",
+                exc_info=True,
+            )
+            return []
+
+    def get_all_products(self, date: datetime.date) -> list[Store]:
+        """
+        Main method to fetch and parse Jadranka Trgovina store and price data.
+
+        Args:
+            date: The date for which to fetch price data
+
+        Returns:
+            List containing a single Store object with products, or empty list if unavailable
+        """
+        csv_url = self.get_index(date)
+
+        if not csv_url:
+            logger.warning(f"No Jadranka Trgovina data available for {date}")
+            return []
+
+        try:
+            store = self.parse_store_info()
+            products = self.get_store_prices(csv_url)
+        except Exception as e:
+            logger.error(f"Error processing Jadranka Trgovina: {e}", exc_info=True)
+            return []
+
+        if not products:
+            logger.warning("No products found for Jadranka Trgovina")
+            return []
+
+        store.items = products
+        logger.info(f"Jadranka Trgovina: {len(products)} products found")
+        return [store]
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    crawler = JadrankaTrgovinaCrawler()
+    stores = crawler.crawl(datetime.date.today())
+    if stores:
+        print(stores[0])
+        if stores[0].items:
+            print(stores[0].items[0])
+    else:
+        print("No stores found")
diff --git a/crawler/store/konzum.py b/crawler/store/konzum.py
index 6649688..6ba1fc7 100644
--- a/crawler/store/konzum.py
+++ b/crawler/store/konzum.py
@@ -1,10 +1,11 @@
 import datetime
 import logging
-import urllib.parse
 import re
+import urllib.parse
 from typing import List
 
 from bs4 import BeautifulSoup
+
 from crawler.store.models import Product, Store
 
 from .base import BaseCrawler
diff --git a/crawler/store/lidl.py b/crawler/store/lidl.py
index 515ec5e..91f3ef4 100644
--- 
a/crawler/store/lidl.py +++ b/crawler/store/lidl.py @@ -1,11 +1,11 @@ import datetime import logging -from typing import Optional import re +from typing import Optional +from crawler.store.models import Product, Store from .base import BaseCrawler -from crawler.store.models import Store, Product logger = logging.getLogger(__name__) diff --git a/crawler/store/ntl.py b/crawler/store/ntl.py index f92017b..a52ffd0 100644 --- a/crawler/store/ntl.py +++ b/crawler/store/ntl.py @@ -2,9 +2,10 @@ import logging import os import re -from urllib.parse import unquote, quote_plus +from urllib.parse import quote_plus, unquote from bs4 import BeautifulSoup + from crawler.store.models import Product, Store from .base import BaseCrawler diff --git a/crawler/store/spar.py b/crawler/store/spar.py index 8f03b23..e000494 100644 --- a/crawler/store/spar.py +++ b/crawler/store/spar.py @@ -1,9 +1,8 @@ import datetime import logging import re -from typing import Optional from json import loads - +from typing import Optional from crawler.store.models import Store diff --git a/crawler/store/studenac.py b/crawler/store/studenac.py index ca5ec03..01bc5a0 100644 --- a/crawler/store/studenac.py +++ b/crawler/store/studenac.py @@ -1,8 +1,8 @@ import datetime import logging -from pathlib import Path import re import subprocess +from pathlib import Path from tempfile import TemporaryDirectory from typing import Generator, Optional, Tuple diff --git a/crawler/store/trgovina_krk.py b/crawler/store/trgovina_krk.py index 0ff9a73..58ae616 100644 --- a/crawler/store/trgovina_krk.py +++ b/crawler/store/trgovina_krk.py @@ -1,13 +1,14 @@ import datetime import logging import re -from typing import Optional, List, Dict, Any +from typing import Any, Dict, List, Optional from bs4 import BeautifulSoup -from .base import BaseCrawler from crawler.store.models import Store +from .base import BaseCrawler + logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 1415108..ebf1901 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ dependencies = [ "uvicorn>=0.34.2", ] -[tool.uv] -dev-dependencies = [ +[dependency-groups] +dev = [ "pre-commit>=4.2.0", "pyright>=1.1.400", "ruff>=0.11.10", diff --git a/service/config.py b/service/config.py index 45c445b..16651f2 100644 --- a/service/config.py +++ b/service/config.py @@ -1,8 +1,8 @@ import os -from dotenv import load_dotenv - from typing import TYPE_CHECKING +from dotenv import load_dotenv + if TYPE_CHECKING: from service.db.base import Database diff --git a/service/db/base.py b/service/db/base.py index 063cc9c..675cecc 100644 --- a/service/db/base.py +++ b/service/db/base.py @@ -4,16 +4,16 @@ from .models import ( Chain, + ChainProduct, + ChainProductWithId, ChainStats, ChainWithId, + Price, Product, ProductWithId, Store, - ChainProduct, - Price, StorePrice, StoreWithId, - ChainProductWithId, User, ) diff --git a/service/db/enrich.py b/service/db/enrich.py index 3e55599..ac8bfef 100644 --- a/service/db/enrich.py +++ b/service/db/enrich.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -import asyncio import argparse +import asyncio import logging +from csv import DictReader from decimal import Decimal from pathlib import Path -from csv import DictReader from time import time -from typing import List, Dict +from typing import Dict, List from service.config import settings from service.db.models import Product diff --git a/service/db/models.py b/service/db/models.py index 8227a21..4af7a6e 100644 --- a/service/db/models.py +++ b/service/db/models.py @@ -1,8 +1,7 @@ -from typing import Optional +from dataclasses import dataclass, fields from datetime import date, datetime from decimal import Decimal - -from dataclasses import dataclass, fields +from typing import Optional @dataclass(frozen=True, slots=True, kw_only=True) diff --git a/service/db/psql.py b/service/db/psql.py index 741d4d5..3f48f4d 100644 --- a/service/db/psql.py +++ b/service/db/psql.py @@ -1,27 +1,29 @@ +import logging +import os 
from contextlib import asynccontextmanager -import asyncpg +from datetime import date from typing import ( + Any, AsyncGenerator, AsyncIterator, List, - Any, ) -import logging -import os -from datetime import date + +import asyncpg + from .base import Database from .models import ( Chain, + ChainProduct, + ChainProductWithId, ChainStats, ChainWithId, + Price, Product, ProductWithId, Store, - ChainProduct, - Price, StorePrice, StoreWithId, - ChainProductWithId, User, ) diff --git a/service/routers/v1.py b/service/routers/v1.py index 406b936..8856ee7 100644 --- a/service/routers/v1.py +++ b/service/routers/v1.py @@ -1,7 +1,8 @@ +import datetime from decimal import Decimal + from fastapi import APIRouter, HTTPException, Query from pydantic import BaseModel, Field -import datetime from service.config import settings from service.db.models import ChainStats, ProductWithId, StorePrice