4 changes: 2 additions & 2 deletions README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/

To run the scraper, execute the main.py script by running the command

-python main.py
+python src/main.py

-Make sure you are in the src directory when you run the command (the directory that contains main.py).
+Make sure you are in the webscraper directory when you run the command

## Where is the entry point?

24 changes: 24 additions & 0 deletions webscraper/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"/": [
"A Light in the Attic",
"Tipping the Velvet",
"Soumission",
"Sharp Objects",
"Sapiens: A Brief History of Humankind",
"The Requiem Red",
"The Dirty Little Secrets of Getting Your Dream Job",
"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"The Black Maria",
"Starving Hearts (Triangular Trade Trilogy, #1)",
"Shakespeare's Sonnets",
"Set Me Free",
"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
"Rip it Up and Start Again",
"Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"Olio",
"Mesaerion: The Best Science Fiction Stories 1800-1849",
"Libertarianism for Beginners",
"It's Only the Himalayas"
]
}
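This output.json is new in the PR, but the code that writes it isn't shown in the diff. Presumably the dict returned by scrape() is serialized with the json module; a minimal sketch of that step, assuming the import path main.py uses after this PR and a working directory of webscraper:

import json

from src.Cheaper_Scraper import CheaperScraper

# Scrape the site root; returns a dict like {"/": ["A Light in the Attic", ...]}
scraper = CheaperScraper("https://books.toscrape.com")
results = scraper.scrape(["/"])

# Write the path -> titles mapping in the shape shown above
with open("output.json", "w") as f:
    json.dump(results, f, indent=4)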
75 changes: 50 additions & 25 deletions webscraper/src/Cheaper_Scraper.py
@@ -3,37 +3,60 @@
from bs4 import BeautifulSoup
import logging
from typing import Dict, List, Optional
# I added these imports below because, when I ran this file directly, Python wasn't finding the packages; it's probably my setup, so remove them if you don't need them
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from ABC.base_scraper import BaseScraper
-from Robot_Check import RoboCheck
+from robot_check import RoboCheck
from functools import lru_cache


class CheaperScraper(BaseScraper):
def __init__(self, base_url: str, user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
"""Initialize the scraper with base parameters.

Args:
base_url: The base URL to scrape
user_agent: User agent string to identify the scraper
delay: Time in seconds to wait between requests
"""

self.base_url = base_url.rstrip('/')
self.delay = delay
self.user_agent = user_agent


# initialize session
self.session = requests.Session()
self.session.headers.update({"User-Agent": self.user_agent})

# robots.txt logic: checks for paths that are not allowed to be fetched
self.robots = RoboCheck(base_url, user_agent)


@staticmethod
@lru_cache(maxsize=128) # cache up to 128 unique URLs
def _cached_get(url: str, user_agent: str) -> Optional[str]:
print(f"[HTTP Request] Fetching from web: {url}")  # temporary: only prints when the URL is not served from the cache
headers = {"User-Agent": user_agent}
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response.text
except requests.RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None


def fetch(self, path: str = "/") -> Optional[str]:
"""Fetch content from a specific path.

Args:
path: The URL path to fetch

Returns:
HTML content as string if successful, None otherwise
"""
@@ -42,43 +65,44 @@ def fetch(self, path: str = "/") -> Optional[str]:
logging.warning(f"Disallowed by robots.txt: {path}")
return None


url = self.base_url + path

-try:
-    response = self.session.get(url, timeout=10)
-    response.raise_for_status()
-    time.sleep(self.delay)  # delay to simulate a user
-    return response.text
-except requests.RequestException as e:
-    logging.error(f"Error fetching {url}: {e}")
-    return None
+cached_before = self._cached_get.cache_info().hits
+html = self._cached_get(url, self.user_agent)
+cached_after = self._cached_get.cache_info().hits
+
+if cached_after == cached_before:  # no cache hit, so it was fetched
+    time.sleep(self.delay)
+
+return html

def parse(self, html: str) -> List[str]:
"""Parse HTML content.

Args:
html: The HTML content to parse

Returns:
List of parsed items from the HTML
"""
soup = BeautifulSoup(html, "html.parser")
results = []

for book in soup.find_all("article", class_="product_pod"):
title = book.h3.a["title"]
results.append(title)

return results

def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
"""Scrape multiple paths.

Args:
paths: List of URL paths to scrape

Returns:
Dictionary mapping paths to their parsed results
"""
@@ -88,4 +112,5 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
html = self.fetch(path)
if html:
results[path] = self.parse(html)
-return results
\ No newline at end of file
+return results

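The fetch() change above leans on a detail of functools.lru_cache bookkeeping: cache_info().hits only increments when a call is answered from the cache, so comparing the hit count before and after a call reveals whether a real request was made (and therefore whether the politeness delay should apply). A self-contained sketch of the same pattern, with no HTTP involved:

from functools import lru_cache

@lru_cache(maxsize=128)
def expensive(x: int) -> int:
    return x * x  # stands in for the real HTTP fetch

def call_and_report(x: int) -> int:
    hits_before = expensive.cache_info().hits
    result = expensive(x)
    if expensive.cache_info().hits == hits_before:
        print(f"{x}: computed fresh")     # analogous to sleeping after a real fetch
    else:
        print(f"{x}: served from cache")  # no delay needed
    return result

call_and_report(3)  # 3: computed fresh
call_and_report(3)  # 3: served from cache

One caveat: this detection isn't safe under concurrency, since another thread's cache hit between the two cache_info() calls would be miscounted; for this single-threaded scraper it works fine.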
Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/main.cpython-39.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-39.pyc
Binary file not shown.
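RoboCheck itself isn't shown in this PR (only its import casing changes from Robot_Check to robot_check), so its interface is an assumption here. For reference, a minimal equivalent of what fetch() relies on, built on the standard library's urllib.robotparser, might look like the sketch below; the method name is_allowed is illustrative, not necessarily the project's actual API:

from urllib.robotparser import RobotFileParser

class RoboCheck:
    """Answers whether a path may be fetched, per the site's robots.txt."""

    def __init__(self, base_url: str, user_agent: str) -> None:
        self.user_agent = user_agent
        self.parser = RobotFileParser()
        self.parser.set_url(base_url.rstrip("/") + "/robots.txt")
        self.parser.read()  # fetch and parse robots.txt once up front

    def is_allowed(self, path: str) -> bool:
        # fetch() logs "Disallowed by robots.txt" and returns None when this is False
        return self.parser.can_fetch(self.user_agent, path)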
29 changes: 28 additions & 1 deletion webscraper/src/main.py
@@ -1,6 +1,12 @@

-from Cheaper_Scraper import CheaperScraper
import json
+# import time  # for cache testing
+# I added these imports below because, when I ran it, Python wasn't finding the packages
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from src.Cheaper_Scraper import CheaperScraper


def main():
# Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,24 @@ def main():

if __name__ == "__main__":
main()




# For testing the cache (also uncomment "import time" at the top)
# def main():
# scraper = CheaperScraper("https://books.toscrape.com")

# print("=== First Request ===")
# start = time.time()
# html1 = scraper.fetch("/") # should print: [HTTP Request] ...
# print("Time taken:", round(time.time() - start, 2), "seconds\n")

# print("=== Second Request (Should Be Cached) ===")
# start = time.time()
# html2 = scraper.fetch("/") # should NOT print: [HTTP Request] ...
# print("Time taken:", round(time.time() - start, 2), "seconds\n")
# print("Cache stats:", scraper._cached_get.cache_info())

# if __name__ == "__main__":
# main()
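A follow-up on the commented-out test above: lru_cache entries live for the whole process, so repeated fetches of the same URL in one run always return the first response, even if the page changes. If a fresh copy is ever needed, the wrapper functools creates exposes cache_clear() (standard functools behavior, not something added in this PR):

# Drop all memoized pages; the next fetch() hits the network again,
# and the cache_info() counters reset to zero.
CheaperScraper._cached_get.cache_clear()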