4 changes: 2 additions & 2 deletions README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/

-To run the scraper, execute the main.py script by running the command

python main.py
python src/main.py

-Make sure you are in the src directory when you run the command (the directory that contains main.py).
-Make sure you are in the webscraper directory when you run the command

## Where is the entry point?

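For reference, the corrected run now looks like this (a sketch; the webscraper/output.json file added in this PR suggests results are written to the directory the command is run from):

    cd webscraper
    python src/main.py    # scraped titles land in output.json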
Binary file not shown.
24 changes: 24 additions & 0 deletions webscraper/output.json
@@ -0,0 +1,24 @@
{
"/": [
"A Light in the Attic",
"Tipping the Velvet",
"Soumission",
"Sharp Objects",
"Sapiens: A Brief History of Humankind",
"The Requiem Red",
"The Dirty Little Secrets of Getting Your Dream Job",
"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"The Black Maria",
"Starving Hearts (Triangular Trade Trilogy, #1)",
"Shakespeare's Sonnets",
"Set Me Free",
"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
"Rip it Up and Start Again",
"Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"Olio",
"Mesaerion: The Best Science Fiction Stories 1800-1849",
"Libertarianism for Beginners",
"It's Only the Himalayas"
]
}
64 changes: 40 additions & 24 deletions webscraper/src/Cheaper_Scraper.py
@@ -1,39 +1,54 @@
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import logging
from typing import Dict, List, Optional
# I added these imports below because when I ran it, it wasn't finding the folders; it is probably me, remove if you don't need them
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.fetch_utils import cached_get
from ABC.base_scraper import BaseScraper
from Robot_Check import RoboCheck
from src.robot_check import RoboCheck
from functools import lru_cache


class CheaperScraper(BaseScraper):
    def __init__(self, base_url: str, user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
        """Initialize the scraper with base parameters.

        Args:
            base_url: The base URL to scrape
            user_agent: User agent string to identify the scraper
            delay: Time in seconds to wait between requests
        """
        parsed_url = urlparse(base_url)
        if not parsed_url.scheme or not parsed_url.netloc:
            raise ValueError(f"Invalid base URL: {base_url}")

        self.base_url = base_url.rstrip('/')
        self.delay = delay
        self.user_agent = user_agent

        # initialize session
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.user_agent})

        # robot logic checks if there are instances not able to be scraped
        self.robots = RoboCheck(base_url, user_agent)

    def fetch(self, path: str = "/") -> Optional[str]:
        """Fetch content from a specific path.

        Args:
            path: The URL path to fetch

        Returns:
            HTML content as string if successful, None otherwise
        """
@@ -43,42 +58,42 @@ def fetch(self, path: str = "/") -> Optional[str]:
            return None

        url = self.base_url + path

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            time.sleep(self.delay)  # delay to simulate a user
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching {url}: {e}")
            return None

        cached_before = cached_get.cache_info().hits
        html = cached_get(url, self.user_agent)
        cached_after = cached_get.cache_info().hits

        if cached_after == cached_before:
            time.sleep(self.delay)

        return html

    def parse(self, html: str) -> List[str]:
        """Parse HTML content.

        Args:
            html: The HTML content to parse

        Returns:
            List of parsed items from the HTML
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        for book in soup.find_all("article", class_="product_pod"):
            title = book.h3.a["title"]
            results.append(title)

        return results

    def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
        """Scrape multiple paths.

        Args:
            paths: List of URL paths to scrape

        Returns:
            Dictionary mapping paths to their parsed results
        """
@@ -88,4 +103,5 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
            html = self.fetch(path)
            if html:
                results[path] = self.parse(html)
        return results
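Note on the reworked fetch: it compares cached_get's hit count before and after the call and sleeps only when the count is unchanged, i.e. when the request actually went out to the network, so cached pages come back without the politeness delay. A minimal usage sketch (URL and path taken from the tests and output.json; comments assume a cold cache):

    scraper = CheaperScraper("https://books.toscrape.com")
    first = scraper.scrape(["/"])   # cache miss: real HTTP request, then a 2 s delay
    second = scraper.scrape(["/"])  # cache hit: no request, no delay
    print(first["/"][0])            # "A Light in the Attic"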

Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file not shown.
Binary file added webscraper/src/__pycache__/main.cpython-39.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-39.pyc
Binary file not shown.
21 changes: 21 additions & 0 deletions webscraper/src/fetch_utils.py
@@ -0,0 +1,21 @@
import requests
import logging
from functools import lru_cache
from typing import Optional


@lru_cache(maxsize=128)
def cached_get(url: str, user_agent: str) -> Optional[str]:
    print(f"[HTTP Request] Fetching from web: {url}")
    headers = {"User-Agent": user_agent}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None
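Because cached_get is wrapped in functools.lru_cache, the cache key is the (url, user_agent) pair, and whatever the call returns is memoized, including the None from a failed request. A small illustrative sketch:

    html_a = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # miss: real HTTP request
    html_b = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # hit: served from memory
    print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
    cached_get.cache_clear()        # forget everything, as the tests do in setUp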




10 changes: 9 additions & 1 deletion webscraper/src/main.py
@@ -1,6 +1,12 @@

from Cheaper_Scraper import CheaperScraper
import json
# import time  # for testing
# I added these imports below because when I ran it, it wasn't finding the folders
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.Cheaper_Scraper import CheaperScraper


def main():
    # Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,5 @@ def main():

if __name__ == "__main__":
    main()
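On the sys.path.append workaround mentioned in the comment above: assuming webscraper/src is a package (the new src/__pycache__/__init__.cpython-39.pyc entry suggests an __init__.py exists), a common alternative is to run main as a module from the webscraper directory, which lets the src imports resolve without any path manipulation:

    python -m src.main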


Binary file not shown.
Binary file not shown.
74 changes: 74 additions & 0 deletions webscraper/src/tests/test_fetch_and_cache.py
@@ -0,0 +1,74 @@
import unittest
import time
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.fetch_utils import cached_get
from src.Cheaper_Scraper import CheaperScraper

# To test, be in the webscraper directory and use the following command in the terminal:
# python src/tests/test_fetch_and_cache.py -v


class TestCheaperScraperFetchCache(unittest.TestCase):

    def setUp(self):
        self.scraper = CheaperScraper("https://books.toscrape.com")
        cached_get.cache_clear()  # Reset cache before each test

    def test_valid_fetch(self):
        html = self.scraper.fetch("/")
        self.assertIsInstance(html, str)
        self.assertIn("<html", html.lower())

    def test_invalid_path_fetch(self):
        html = self.scraper.fetch("/this-page-does-not-exist")
        # Even though it doesn't exist, the site may return a 200 with a 404 page
        self.assertTrue(html is None or "<html" in html.lower())

    def test_cache_effectiveness(self):
        start = time.time()
        self.scraper.fetch("/")  # First fetch
        time1 = time.time() - start

        start = time.time()
        self.scraper.fetch("/")  # Second fetch (should be cached)
        time2 = time.time() - start

        cache_info = cached_get.cache_info()
        self.assertLess(time2, time1)
        self.assertGreaterEqual(cache_info.hits, 1)

    def test_non_http_url(self):
        with self.assertRaises(ValueError):
            CheaperScraper("not_a_real_url")

    def test_cache_timing_and_stats(self):
        print("\n=== Cache Timing and Stats Test ===")

        # First fetch (expected to be slow and hit the network)
        start = time.time()
        html1 = self.scraper.fetch("/")
        time1 = round(time.time() - start, 2)
        print(f"First fetch took: {time1} seconds")

        # Second fetch (expected to be fast due to cache)
        start = time.time()
        html2 = self.scraper.fetch("/")
        time2 = round(time.time() - start, 2)
        print(f"Second fetch took: {time2} seconds")

        # Confirm that the second fetch was faster
        self.assertLess(time2, time1, "Second fetch should be faster due to caching")

        # Print and assert cache stats
        stats = cached_get.cache_info()
        print("Cache stats:", stats)
        self.assertGreaterEqual(stats.hits, 1, "There should be at least 1 cache hit")


if __name__ == "__main__":
    unittest.main()