4 changes: 2 additions & 2 deletions README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/

To run the scraper, execute the main.py script by running the command

-python main.py
+python src/main.py

-Make sure you are in the src directory when you run the command (the directory that contains main.py).
+Make sure you are in the webscraper directory when you run the command

## Where is the entry point?

24 changes: 24 additions & 0 deletions webscraper/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"/": [
"A Light in the Attic",
"Tipping the Velvet",
"Soumission",
"Sharp Objects",
"Sapiens: A Brief History of Humankind",
"The Requiem Red",
"The Dirty Little Secrets of Getting Your Dream Job",
"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"The Black Maria",
"Starving Hearts (Triangular Trade Trilogy, #1)",
"Shakespeare's Sonnets",
"Set Me Free",
"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
"Rip it Up and Start Again",
"Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"Olio",
"Mesaerion: The Best Science Fiction Stories 1800-1849",
"Libertarianism for Beginners",
"It's Only the Himalayas"
]
}
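This output.json is new in the PR, but the code that writes it isn't shown in the diff. Presumably the dict returned by scrape() is serialized with the json module; a minimal sketch of that step, assuming the import path main.py uses after this PR and a working directory of webscraper:

import json

from src.Cheaper_Scraper import CheaperScraper

# Scrape the site root; returns a dict like {"/": ["A Light in the Attic", ...]}
scraper = CheaperScraper("https://books.toscrape.com")
results = scraper.scrape(["/"])

# Write the path -> titles mapping in the shape shown above
with open("output.json", "w") as f:
    json.dump(results, f, indent=4)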
75 changes: 50 additions & 25 deletions webscraper/src/Cheaper_Scraper.py
@@ -3,37 +3,60 @@
from bs4 import BeautifulSoup
import logging
from typing import Dict, List, Optional
# I added these imports below because, when I ran this file directly, Python wasn't finding the packages; it's probably my setup, so remove them if you don't need them
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from ABC.base_scraper import BaseScraper
-from Robot_Check import RoboCheck
+from robot_check import RoboCheck
from functools import lru_cache


class CheaperScraper(BaseScraper):
def __init__(self, base_url: str, user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
"""Initialize the scraper with base parameters.

Args:
base_url: The base URL to scrape
user_agent: User agent string to identify the scraper
delay: Time in seconds to wait between requests
"""

self.base_url = base_url.rstrip('/')
self.delay = delay
self.user_agent = user_agent


# initialize session
self.session = requests.Session()
self.session.headers.update({"User-Agent": self.user_agent})

# robots.txt logic: checks for paths that are not allowed to be fetched
self.robots = RoboCheck(base_url, user_agent)


@staticmethod
@lru_cache(maxsize=128) # cache up to 128 unique URLs
def _cached_get(url: str, user_agent: str) -> Optional[str]:
print(f"[HTTP Request] Fetching from web: {url}")  # temporary: only prints when the URL is not served from the cache
headers = {"User-Agent": user_agent}
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response.text
except requests.RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None


def fetch(self, path: str = "/") -> Optional[str]:
"""Fetch content from a specific path.

Args:
path: The URL path to fetch

Returns:
HTML content as string if successful, None otherwise
"""
@@ -42,43 +65,44 @@ def fetch(self, path: str = "/") -> Optional[str]:
logging.warning(f"Disallowed by robots.txt: {path}")
return None


url = self.base_url + path

-try:
-    response = self.session.get(url, timeout=10)
-    response.raise_for_status()
-    time.sleep(self.delay)  # delay to simulate a user
-    return response.text
-except requests.RequestException as e:
-    logging.error(f"Error fetching {url}: {e}")
-    return None
+cached_before = self._cached_get.cache_info().hits
+html = self._cached_get(url, self.user_agent)
+cached_after = self._cached_get.cache_info().hits
+
+if cached_after == cached_before:  # no cache hit, so it was fetched
+    time.sleep(self.delay)
+
+return html

def parse(self, html: str) -> List[str]:
"""Parse HTML content.

Args:
html: The HTML content to parse

Returns:
List of parsed items from the HTML
"""
soup = BeautifulSoup(html, "html.parser")
results = []

for book in soup.find_all("article", class_="product_pod"):
title = book.h3.a["title"]
results.append(title)

return results

def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
"""Scrape multiple paths.

Args:
paths: List of URL paths to scrape

Returns:
Dictionary mapping paths to their parsed results
"""
@@ -88,4 +112,5 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
html = self.fetch(path)
if html:
results[path] = self.parse(html)
-return results
\ No newline at end of file
+return results

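The fetch() change above leans on a detail of functools.lru_cache bookkeeping: cache_info().hits only increments when a call is answered from the cache, so comparing the hit count before and after a call reveals whether a real request was made (and therefore whether the politeness delay should apply). A self-contained sketch of the same pattern, with no HTTP involved:

from functools import lru_cache

@lru_cache(maxsize=128)
def expensive(x: int) -> int:
    return x * x  # stands in for the real HTTP fetch

def call_and_report(x: int) -> int:
    hits_before = expensive.cache_info().hits
    result = expensive(x)
    if expensive.cache_info().hits == hits_before:
        print(f"{x}: computed fresh")     # analogous to sleeping after a real fetch
    else:
        print(f"{x}: served from cache")  # no delay needed
    return result

call_and_report(3)  # 3: computed fresh
call_and_report(3)  # 3: served from cache

One caveat: this detection isn't safe under concurrency, since another thread's cache hit between the two cache_info() calls would be miscounted; for this single-threaded scraper it works fine.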
Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/main.cpython-39.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-39.pyc
Binary file not shown.
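RoboCheck itself isn't shown in this PR (only its import casing changes from Robot_Check to robot_check), so its interface is an assumption here. For reference, a minimal equivalent of what fetch() relies on, built on the standard library's urllib.robotparser, might look like the sketch below; the method name is_allowed is illustrative, not necessarily the project's actual API:

from urllib.robotparser import RobotFileParser

class RoboCheck:
    """Answers whether a path may be fetched, per the site's robots.txt."""

    def __init__(self, base_url: str, user_agent: str) -> None:
        self.user_agent = user_agent
        self.parser = RobotFileParser()
        self.parser.set_url(base_url.rstrip("/") + "/robots.txt")
        self.parser.read()  # fetch and parse robots.txt once up front

    def is_allowed(self, path: str) -> bool:
        # fetch() logs "Disallowed by robots.txt" and returns None when this is False
        return self.parser.can_fetch(self.user_agent, path)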
29 changes: 28 additions & 1 deletion webscraper/src/main.py
@@ -1,6 +1,12 @@

-from Cheaper_Scraper import CheaperScraper
import json
+# import time  # for cache testing
+# I added these imports below because, when I ran it, Python wasn't finding the packages
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from src.Cheaper_Scraper import CheaperScraper


def main():
# Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,24 @@ def main():

if __name__ == "__main__":
main()




# For testing the cache (also uncomment "import time" at the top)
# def main():
# scraper = CheaperScraper("https://books.toscrape.com")

# print("=== First Request ===")
# start = time.time()
# html1 = scraper.fetch("/") # should print: [HTTP Request] ...
# print("Time taken:", round(time.time() - start, 2), "seconds\n")

# print("=== Second Request (Should Be Cached) ===")
# start = time.time()
# html2 = scraper.fetch("/") # should NOT print: [HTTP Request] ...
# print("Time taken:", round(time.time() - start, 2), "seconds\n")
# print("Cache stats:", scraper._cached_get.cache_info())

# if __name__ == "__main__":
# main()
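A follow-up on the commented-out test above: lru_cache entries live for the whole process, so repeated fetches of the same URL in one run always return the first response, even if the page changes. If a fresh copy is ever needed, the wrapper functools creates exposes cache_clear() (standard functools behavior, not something added in this PR):

# Drop all memoized pages; the next fetch() hits the network again,
# and the cache_info() counters reset to zero.
CheaperScraper._cached_get.cache_clear()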