4 changes: 2 additions & 2 deletions README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/

-To run the scraper, execute the main.py script by running the command

python main.py
python src/main.py

-Make sure you are in the src directory when you run the command (the directory that contains main.py).
-Make sure you are in the webscraper directory when you run the command

## Where is the entry point?

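For reference, the corrected run now looks like this (a sketch; the webscraper/output.json file added in this PR suggests results are written to the directory the command is run from):

    cd webscraper
    python src/main.py    # scraped titles land in output.json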
Binary file not shown.
24 changes: 24 additions & 0 deletions webscraper/output.json
@@ -0,0 +1,24 @@
{
"/": [
"A Light in the Attic",
"Tipping the Velvet",
"Soumission",
"Sharp Objects",
"Sapiens: A Brief History of Humankind",
"The Requiem Red",
"The Dirty Little Secrets of Getting Your Dream Job",
"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"The Black Maria",
"Starving Hearts (Triangular Trade Trilogy, #1)",
"Shakespeare's Sonnets",
"Set Me Free",
"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
"Rip it Up and Start Again",
"Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"Olio",
"Mesaerion: The Best Science Fiction Stories 1800-1849",
"Libertarianism for Beginners",
"It's Only the Himalayas"
]
}
64 changes: 40 additions & 24 deletions webscraper/src/Cheaper_Scraper.py
@@ -1,39 +1,54 @@
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import logging
from typing import Dict, List, Optional
# I added these imports below because when I ran it, it wasn't finding the folders; it is probably me, remove if you don't need them
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.fetch_utils import cached_get
from ABC.base_scraper import BaseScraper
from Robot_Check import RoboCheck
from src.robot_check import RoboCheck
from functools import lru_cache


class CheaperScraper(BaseScraper):
    def __init__(self, base_url: str, user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
        """Initialize the scraper with base parameters.

        Args:
            base_url: The base URL to scrape
            user_agent: User agent string to identify the scraper
            delay: Time in seconds to wait between requests
        """
        parsed_url = urlparse(base_url)
        if not parsed_url.scheme or not parsed_url.netloc:
            raise ValueError(f"Invalid base URL: {base_url}")

        self.base_url = base_url.rstrip('/')
        self.delay = delay
        self.user_agent = user_agent

        # initialize session
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.user_agent})

        # robot logic checks if there are instances not able to be scraped
        self.robots = RoboCheck(base_url, user_agent)

    def fetch(self, path: str = "/") -> Optional[str]:
        """Fetch content from a specific path.

        Args:
            path: The URL path to fetch

        Returns:
            HTML content as string if successful, None otherwise
        """
@@ -43,42 +58,42 @@ def fetch(self, path: str = "/") -> Optional[str]:
            return None

        url = self.base_url + path

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            time.sleep(self.delay)  # delay to simulate a user
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching {url}: {e}")
            return None

        cached_before = cached_get.cache_info().hits
        html = cached_get(url, self.user_agent)
        cached_after = cached_get.cache_info().hits

        if cached_after == cached_before:
            time.sleep(self.delay)

        return html

    def parse(self, html: str) -> List[str]:
        """Parse HTML content.

        Args:
            html: The HTML content to parse

        Returns:
            List of parsed items from the HTML
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        for book in soup.find_all("article", class_="product_pod"):
            title = book.h3.a["title"]
            results.append(title)

        return results

    def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
        """Scrape multiple paths.

        Args:
            paths: List of URL paths to scrape

        Returns:
            Dictionary mapping paths to their parsed results
        """
@@ -88,4 +103,5 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
            html = self.fetch(path)
            if html:
                results[path] = self.parse(html)
        return results
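Note on the reworked fetch: it compares cached_get's hit count before and after the call and sleeps only when the count is unchanged, i.e. when the request actually went out to the network, so cached pages come back without the politeness delay. A minimal usage sketch (URL and path taken from the tests and output.json; comments assume a cold cache):

    scraper = CheaperScraper("https://books.toscrape.com")
    first = scraper.scrape(["/"])   # cache miss: real HTTP request, then a 2 s delay
    second = scraper.scrape(["/"])  # cache hit: no request, no delay
    print(first["/"][0])            # "A Light in the Attic"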

Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file not shown.
Binary file added webscraper/src/__pycache__/main.cpython-39.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-39.pyc
Binary file not shown.
21 changes: 21 additions & 0 deletions webscraper/src/fetch_utils.py
@@ -0,0 +1,21 @@
import requests
import logging
from functools import lru_cache
from typing import Optional


@lru_cache(maxsize=128)
def cached_get(url: str, user_agent: str) -> Optional[str]:
    print(f"[HTTP Request] Fetching from web: {url}")
    headers = {"User-Agent": user_agent}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None
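Because cached_get is wrapped in functools.lru_cache, the cache key is the (url, user_agent) pair, and whatever the call returns is memoized, including the None from a failed request. A small illustrative sketch:

    html_a = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # miss: real HTTP request
    html_b = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # hit: served from memory
    print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
    cached_get.cache_clear()        # forget everything, as the tests do in setUp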




10 changes: 9 additions & 1 deletion webscraper/src/main.py
@@ -1,6 +1,12 @@

from Cheaper_Scraper import CheaperScraper
import json
# import time  # for testing
# I added these imports below because when I ran it, it wasn't finding the folders
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.Cheaper_Scraper import CheaperScraper


def main():
    # Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,5 @@ def main():

if __name__ == "__main__":
    main()
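On the sys.path.append workaround mentioned in the comment above: assuming webscraper/src is a package (the new src/__pycache__/__init__.cpython-39.pyc entry suggests an __init__.py exists), a common alternative is to run main as a module from the webscraper directory, which lets the src imports resolve without any path manipulation:

    python -m src.main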


Binary file not shown.
Binary file not shown.
74 changes: 74 additions & 0 deletions webscraper/src/tests/test_fetch_and_cache.py
@@ -0,0 +1,74 @@
import unittest
import time
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.fetch_utils import cached_get
from src.Cheaper_Scraper import CheaperScraper

# To test, be in the webscraper directory and use the following command in the terminal:
# python src/tests/test_fetch_and_cache.py -v


class TestCheaperScraperFetchCache(unittest.TestCase):

    def setUp(self):
        self.scraper = CheaperScraper("https://books.toscrape.com")
        cached_get.cache_clear()  # Reset cache before each test

    def test_valid_fetch(self):
        html = self.scraper.fetch("/")
        self.assertIsInstance(html, str)
        self.assertIn("<html", html.lower())

    def test_invalid_path_fetch(self):
        html = self.scraper.fetch("/this-page-does-not-exist")
        # Even though it doesn't exist, the site may return a 200 with a 404 page
        self.assertTrue(html is None or "<html" in html.lower())

    def test_cache_effectiveness(self):
        start = time.time()
        self.scraper.fetch("/")  # First fetch
        time1 = time.time() - start

        start = time.time()
        self.scraper.fetch("/")  # Second fetch (should be cached)
        time2 = time.time() - start

        cache_info = cached_get.cache_info()
        self.assertLess(time2, time1)
        self.assertGreaterEqual(cache_info.hits, 1)

    def test_non_http_url(self):
        with self.assertRaises(ValueError):
            CheaperScraper("not_a_real_url")

    def test_cache_timing_and_stats(self):
        print("\n=== Cache Timing and Stats Test ===")

        # First fetch (expected to be slow and hit the network)
        start = time.time()
        html1 = self.scraper.fetch("/")
        time1 = round(time.time() - start, 2)
        print(f"First fetch took: {time1} seconds")

        # Second fetch (expected to be fast due to cache)
        start = time.time()
        html2 = self.scraper.fetch("/")
        time2 = round(time.time() - start, 2)
        print(f"Second fetch took: {time2} seconds")

        # Confirm that the second fetch was faster
        self.assertLess(time2, time1, "Second fetch should be faster due to caching")

        # Print and assert cache stats
        stats = cached_get.cache_info()
        print("Cache stats:", stats)
        self.assertGreaterEqual(stats.hits, 1, "There should be at least 1 cache hit")


if __name__ == "__main__":
    unittest.main()