diff --git a/data/AAPL.csv b/data/AAPL.csv
new file mode 100644
index 0000000000..622ae0a30e
--- /dev/null
+++ b/data/AAPL.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,254.6699981689453,259.239990234375,253.9499969482422,258.0199890136719,258.0199890136719,49155600
+2025-10-06,257.989990234375,259.07000732421875,255.0500030517578,256.69000244140625,256.69000244140625,44664100
+2025-10-07,256.80999755859375,257.3999938964844,255.42999267578125,256.4800109863281,256.4800109863281,31955800
+2025-10-08,256.5199890136719,258.5199890136719,256.1099853515625,258.05999755859375,258.05999755859375,36496900
+2025-10-09,257.80999755859375,258.0,253.13999938964844,254.0399932861328,254.0399932861328,38270200
diff --git a/data/AMZN.csv b/data/AMZN.csv
new file mode 100644
index 0000000000..6714a1b2b9
--- /dev/null
+++ b/data/AMZN.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,223.44000244140625,224.1999969482422,219.33999633789062,219.50999450683594,219.50999450683594,43639000
+2025-10-06,221.0,221.72999572753906,216.02999877929688,220.89999389648438,220.89999389648438,43690900
+2025-10-07,220.8800048828125,222.88999938964844,220.1699981689453,221.77999877929688,221.77999877929688,31194700
+2025-10-08,222.9199981689453,226.72999572753906,221.19000244140625,225.22000122070312,225.22000122070312,46686000
+2025-10-09,225.0,228.2100067138672,221.75,227.74000549316406,227.74000549316406,46320900
diff --git a/data/GOOGL.csv b/data/GOOGL.csv
new file mode 100644
index 0000000000..53ed2c3907
--- /dev/null
+++ b/data/GOOGL.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,244.49000549316406,246.3000030517578,241.66000366210938,245.35000610351562,245.35000610351562,30249600
+2025-10-06,244.77999877929688,251.32000732421875,244.5800018310547,250.42999267578125,250.42999267578125,28894700
+2025-10-07,248.27000427246094,250.44000244140625,245.52000427246094,245.75999450683594,245.75999450683594,23181300
+2025-10-08,244.9600067138672,246.00999450683594,243.82000732421875,244.6199951171875,244.6199951171875,21307100
+2025-10-09,244.47000122070312,244.75999450683594,239.14999389648438,241.52999877929688,241.52999877929688,27828700
diff --git a/data/JPM.csv b/data/JPM.csv
new file mode 100644
index 0000000000..a803882e06
--- /dev/null
+++ b/data/JPM.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,308.510009765625,311.6600036621094,308.2099914550781,310.0299987792969,308.5299987792969,6029900
+2025-10-06,310.17999267578125,311.75,305.1300048828125,309.17999267578125,309.17999267578125,7214500
+2025-10-07,309.3500061035156,310.010009765625,304.70001220703125,307.69000244140625,307.69000244140625,8454200
+2025-10-08,308.2099914550781,308.7799987792969,303.54998779296875,304.0299987792969,304.0299987792969,6489900
+2025-10-09,305.04998779296875,308.0400085449219,303.3900146484375,305.5299987792969,305.5299987792969,7057800
diff --git a/data/META.csv b/data/META.csv
new file mode 100644
index 0000000000..d0399c5511
--- /dev/null
+++ b/data/META.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,729.6300048828125,731.0,710.1799926757812,710.5599975585938,710.5599975585938,16154300
+2025-10-06,705.1900024414062,716.8800048828125,690.510009765625,715.6599731445312,715.6599731445312,21654700
+2025-10-07,717.719970703125,718.5,705.75,713.0800170898438,713.0800170898438,12062900
+2025-10-08,713.4500122070312,719.6500244140625,707.8099975585938,717.8400268554688,717.8400268554688,10790600
+2025-10-09,718.280029296875,733.510009765625,712.4400024414062,733.510009765625,733.510009765625,12692100
diff --git a/data/MSFT.csv b/data/MSFT.csv
new file mode 100644
index 0000000000..37aa5f0059
--- /dev/null
+++ b/data/MSFT.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,517.0999755859375,520.489990234375,515.0,517.3499755859375,517.3499755859375,15112300
+2025-10-06,518.6099853515625,531.030029296875,518.2000122070312,528.5700073242188,528.5700073242188,21388600
+2025-10-07,528.2899780273438,529.7999877929688,521.4400024414062,523.97998046875,523.97998046875,14615200
+2025-10-08,523.280029296875,526.9500122070312,523.0900268554688,524.8499755859375,524.8499755859375,13363400
+2025-10-09,522.3400268554688,524.3300170898438,517.4000244140625,522.4000244140625,522.4000244140625,18324500
diff --git a/data/NVDA.csv b/data/NVDA.csv
new file mode 100644
index 0000000000..f621bb8875
--- /dev/null
+++ b/data/NVDA.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,189.19000244140625,190.36000061035156,185.3800048828125,187.6199951171875,187.6199951171875,137596900
+2025-10-06,185.5,187.22999572753906,183.3300018310547,185.5399932861328,185.5399932861328,157678100
+2025-10-07,186.22999572753906,189.05999755859375,184.0,185.0399932861328,185.0399932861328,140088000
+2025-10-08,186.57000732421875,189.60000610351562,186.5399932861328,189.11000061035156,189.11000061035156,130168900
+2025-10-09,192.22999572753906,195.3000030517578,191.05999755859375,192.57000732421875,192.57000732421875,182490600
diff --git a/data/TSLA.csv b/data/TSLA.csv
new file mode 100644
index 0000000000..dc88cb4765
--- /dev/null
+++ b/data/TSLA.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,443.2900085449219,446.7699890136719,416.5799865722656,429.8299865722656,429.8299865722656,133188200
+2025-10-06,440.75,453.54998779296875,436.69000244140625,453.25,453.25,85324900
+2025-10-07,447.82000732421875,452.67999267578125,432.45001220703125,433.0899963378906,433.0899963378906,102296100
+2025-10-08,437.57000732421875,441.3299865722656,425.2300109863281,438.69000244140625,438.69000244140625,71192100
+2025-10-09,431.80999755859375,436.3500061035156,426.17999267578125,435.5400085449219,435.5400085449219,69149400
diff --git a/data/UNH.csv b/data/UNH.csv
new file mode 100644
index 0000000000..4a685fe84b
--- /dev/null
+++ b/data/UNH.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,357.25,368.0,356.6099853515625,360.20001220703125,360.20001220703125,13494800
+2025-10-06,361.0299987792969,362.5199890136719,357.1099853515625,358.7699890136719,358.7699890136719,7181700
+2025-10-07,359.4100036621094,364.7200012207031,358.05999755859375,363.6600036621094,363.6600036621094,7236200
+2025-10-08,366.92999267578125,374.6300048828125,365.79998779296875,369.9200134277344,369.9200134277344,9438300
+2025-10-09,372.5199890136719,376.2200012207031,366.0799865722656,367.69000244140625,367.69000244140625,8728800
diff --git a/data/V.csv b/data/V.csv
new file mode 100644
index 0000000000..e785fb5b40
--- /dev/null
+++ b/data/V.csv
@@ -0,0 +1,6 @@
+Date,Open,High,Low,Close,Adj Close,Volume
+2025-10-03,346.1099853515625,353.1600036621094,346.0,349.8399963378906,349.8399963378906,5199700
+2025-10-06,350.010009765625,351.1199951171875,344.3500061035156,349.2799987792969,349.2799987792969,4735600
+2025-10-07,350.05999755859375,354.44000244140625,350.05999755859375,352.4200134277344,352.4200134277344,5044600
+2025-10-08,353.7799987792969,355.0,351.1000061035156,351.3599853515625,351.3599853515625,3927500
+2025-10-09,351.4700012207031,352.8699951171875,345.5799865722656,347.0400085449219,347.0400085449219,4434300
diff --git a/fetch_stocks.py b/fetch_stocks.py
new file mode 100644
index 0000000000..d636b1b829
--- /dev/null
+++ b/fetch_stocks.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+fetch_stocks.py
+
+Fetch stock price data for multiple tickers using yfinance while minimizing rate-limit errors.
+
+Features:
+- Batch downloads instead of one request per ticker
+- 1–2 second randomized delay between batch requests
+- Retries with exponential backoff on failures (including probable rate limits)
+- Per-ticker CSV caching; a ticker is refreshed only if its cache is older than 24 hours
+- Errors logged to logs/errors.log
+
+Usage examples:
+- python fetch_stocks.py --tickers AAPL MSFT GOOGL AMZN
+- python fetch_stocks.py --tickers-file tickers.txt --batch-size 10 --period 1y --interval 1d
+
+Dependencies:
+- yfinance (pip install yfinance)
+- pandas (pip install pandas)
+
+This script is Windows-friendly and ready for use in GSSOC 2025 projects.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+import random
+import logging
+from typing import Iterable, List, Sequence
+
+try:
+    import pandas as pd
+except Exception:
+    print("pandas is required. Install with: pip install pandas", file=sys.stderr)
+    raise
+
+try:
+    import yfinance as yf
+except Exception:
+    print("yfinance is required. Install with: pip install yfinance", file=sys.stderr)
+    raise
+
+
+FRESHNESS_SECONDS = 24 * 60 * 60  # 24 hours
+
+
+def setup_logging(log_dir: str = "logs") -> logging.Logger:
+    os.makedirs(log_dir, exist_ok=True)
+    logger = logging.getLogger("stock_fetcher")
+    logger.setLevel(logging.INFO)
+
+    # Avoid duplicate handlers if the script is re-run within the same interpreter
+    if logger.handlers:
+        return logger
+
+    # Console handler (INFO and above)
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.INFO)
+    ch.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
+
+    # Error file handler (ERROR only)
+    err_path = os.path.join(log_dir, "errors.log")
+    fh_err = logging.FileHandler(err_path, mode="a", encoding="utf-8")
+    fh_err.setLevel(logging.ERROR)
+    fh_err.setFormatter(
+        logging.Formatter("%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s")
+    )
+
+    logger.addHandler(ch)
+    logger.addHandler(fh_err)
+    return logger
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="Batch fetch stock data with caching and retries")
+    p.add_argument(
+        "--tickers",
+        nargs="*",
+        default=None,
+        help="Space-separated list of ticker symbols (e.g., AAPL MSFT GOOGL)",
+    )
+    p.add_argument(
+        "--tickers-file",
+        type=str,
+        default=None,
+        help="Path to a file containing ticker symbols (one per line, # for comments)",
+    )
+    p.add_argument(
+        "--batch-size",
+        type=int,
+        default=10,
+        help="Number of tickers per batch request (default: 10)",
+    )
+    p.add_argument(
+        "--period",
+        type=str,
+        default="1y",
+        help="yfinance period (e.g., 1mo, 3mo, 6mo, 1y, 2y, max). Default: 1y",
+    )
+    p.add_argument(
+        "--interval",
+        type=str,
+        default="1d",
+        help="yfinance interval (e.g., 1d, 1wk, 1mo). Default: 1d",
+    )
+    p.add_argument(
+        "--data-dir",
+        type=str,
+        default="data",
+        help="Directory where per-ticker CSVs will be stored. Default: data",
+    )
+    p.add_argument(
+        "--max-retries",
+        type=int,
+        default=3,
+        help="Max retries per batch on failure. Default: 3",
+    )
+    p.add_argument(
+        "--backoff-base",
+        type=float,
+        default=1.5,
+        help="Base seconds for exponential backoff. Default: 1.5",
+    )
+    p.add_argument(
+        "--backoff-max",
+        type=float,
+        default=30.0,
+        help="Max seconds to sleep between retries. Default: 30",
+    )
+    return p.parse_args()
+
+
+def load_tickers(args: argparse.Namespace) -> List[str]:
+    tickers: List[str] = []
+
+    if args.tickers_file:
+        if not os.path.exists(args.tickers_file):
+            raise FileNotFoundError(f"Tickers file not found: {args.tickers_file}")
+        with open(args.tickers_file, "r", encoding="utf-8") as f:
+            for line in f:
+                s = line.strip()
+                if not s or s.startswith("#"):
+                    continue
+                tickers.append(s.upper())
+
+    if args.tickers:
+        tickers.extend([t.upper() for t in args.tickers])
+
+    # Fall back to defaults if the user provided none
+    if not tickers:
+        tickers = ["AAPL", "MSFT", "GOOGL", "AMZN"]
+
+    # Deduplicate while preserving order
+    seen = set()
+    deduped = []
+    for t in tickers:
+        if t not in seen:
+            deduped.append(t)
+            seen.add(t)
+    return deduped
+
+
+def ensure_dirs(*paths: str) -> None:
+    for p in paths:
+        os.makedirs(p, exist_ok=True)
+
+
+def is_cache_fresh(path: str, freshness_seconds: int = FRESHNESS_SECONDS) -> bool:
+    if not os.path.exists(path):
+        return False
+    age = time.time() - os.path.getmtime(path)
+    return age < freshness_seconds
+
+
+def chunked(seq: Sequence[str], size: int) -> Iterable[List[str]]:
+    for i in range(0, len(seq), size):
+        yield list(seq[i : i + size])
+
+
+def looks_like_rate_limit(err: Exception) -> bool:
+    msg = str(err).lower()
+    # Common signals: HTTP 429, Too Many Requests, rate limit
+    return ("429" in msg) or ("too many request" in msg) or ("rate limit" in msg)
+
+
+def download_batch_with_retries(
+    tickers: List[str],
+    period: str,
+    interval: str,
+    max_retries: int,
+    backoff_base: float,
+    backoff_max: float,
+    logger: logging.Logger,
+) -> pd.DataFrame:
+    attempt = 0
+    last_exc: Exception | None = None
+
+    while attempt <= max_retries:
+        try:
+            # threads=False to be gentler on rate limits
+            df = yf.download(
+                tickers=tickers,
+                period=period,
+                interval=interval,
+                group_by="ticker",  # easier to split per ticker
+                auto_adjust=False,
+                threads=False,
+                progress=False,
+            )
+            if df is None or (isinstance(df, pd.DataFrame) and df.empty):
+                raise ValueError(f"Empty data returned for batch: {tickers}")
+            return df
+        except Exception as e:
+            last_exc = e
+            attempt += 1
+            if attempt > max_retries:
+                # Log at ERROR on the final failure
+                logger.error(
+                    "Batch failed after %d retries for %s | error=%s",
+                    max_retries,
+                    ",".join(tickers),
+                    repr(e),
+                )
+                break
+
+            # Warn and back off with jitter
+            kind = "rate-limit" if looks_like_rate_limit(e) else "error"
+            sleep_for = min(backoff_base * (2 ** (attempt - 1)) + random.uniform(0, 0.5), backoff_max)
+            logger.warning(
+                "Batch %s on attempt %d/%d for %s; backing off %.2fs | error=%s",
+                kind,
+                attempt,
+                max_retries,
+                ",".join(tickers),
+                sleep_for,
+                repr(e),
+            )
+            time.sleep(sleep_for)
+
+    # If we reach here, all retries failed
+    if last_exc:
+        raise last_exc
+    raise RuntimeError("download_batch_with_retries failed for unknown reasons")
+
+
+def save_per_ticker_csv(
+    df: pd.DataFrame,
+    tickers: List[str],
+    data_dir: str,
+    logger: logging.Logger,
+) -> None:
+    """Save a DataFrame to per-ticker CSVs, handling both single- and multi-ticker shapes."""
+    # Shape 1: group_by="ticker" with multiple tickers -> MultiIndex columns (level 0 = ticker, level 1 = fields)
+    if isinstance(df.columns, pd.MultiIndex) and df.columns.nlevels == 2:
+        level0 = df.columns.get_level_values(0)
+        present = set(level0.unique())
+        for t in tickers:
+            if t not in present:
+                logger.warning("Ticker %s not present in returned data; skipping save", t)
+                continue
+            tdf = df[t]
+            tdf = tdf.dropna(how="all")
+            if tdf.empty:
+                logger.warning("Ticker %s resulted in an empty frame after dropna; skipping", t)
+                continue
+            out_path = os.path.join(data_dir, f"{t}.csv")
+            tdf.to_csv(out_path, index=True)
+            logger.info("Saved %s (%d rows) -> %s", t, len(tdf), out_path)
+        return
+
+    # Shape 2: single ticker or non-MultiIndex columns
+    tdf = df.dropna(how="all")
+    if not tdf.empty:
+        # If we can infer a single ticker name from the columns, use it; otherwise use a generic name
+        inferred = None
+        # yfinance sometimes sets df.columns.name to the ticker
+        if getattr(df.columns, "name", None):
+            inferred = str(df.columns.name)
+        # Fallback: if only one ticker was requested, use that
+        if len(tickers) == 1:
+            inferred = tickers[0]
+        # Absolute fallback
+        if not inferred:
+            inferred = "ticker"
+        out_path = os.path.join(data_dir, f"{inferred}.csv")
+        tdf.to_csv(out_path, index=True)
+        logger.info("Saved %s (%d rows) -> %s", inferred, len(tdf), out_path)
+    else:
+        logger.warning("Download returned an empty DataFrame; nothing saved")
+
+
+def main() -> None:
+    args = parse_args()
+    logger = setup_logging()
+
+    ensure_dirs(args.data_dir, "logs")
+
+    tickers = load_tickers(args)
+    logger.info("Total tickers requested: %d", len(tickers))
+
+    # Filter tickers that actually need an update (cache older than 24h)
+    to_update: List[str] = []
+    for t in tickers:
+        out_path = os.path.join(args.data_dir, f"{t}.csv")
+        if is_cache_fresh(out_path, FRESHNESS_SECONDS):
+            logger.info("Up-to-date (<=24h): %s", t)
+        else:
+            to_update.append(t)
+
+    if not to_update:
+        logger.info("All tickers are up-to-date. Nothing to fetch.")
+        return
+
+    logger.info("Tickers to update: %d", len(to_update))
+
+    # Process in batches
+    for batch_num, batch in enumerate(chunked(to_update, args.batch_size), start=1):
+        logger.info("Fetching batch %d: %s", batch_num, ",".join(batch))
+        try:
+            df = download_batch_with_retries(
+                tickers=batch,
+                period=args.period,
+                interval=args.interval,
+                max_retries=args.max_retries,
+                backoff_base=args.backoff_base,
+                backoff_max=args.backoff_max,
+                logger=logger,
+            )
+            save_per_ticker_csv(df, batch, args.data_dir, logger)
+        except Exception as e:
+            # Already logged inside the retry helper; record the batch-level failure as well
+            logger.error("Failed batch %d for %s | error=%s", batch_num, ",".join(batch), repr(e))
+        finally:
+            # Polite delay between batches to reduce rate-limit hits
+            delay = random.uniform(1.0, 2.0)
+            logger.info("Sleeping %.2fs before next batch...", delay)
+            time.sleep(delay)
+
+    logger.info("Done.")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000..ea9e7d4bd8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+yfinance>=0.2.0
+pandas>=1.3.0
diff --git a/tickers.txt b/tickers.txt
new file mode 100644
index 0000000000..7541441f65
--- /dev/null
+++ b/tickers.txt
@@ -0,0 +1,18 @@
+# Sample tickers file - one ticker per line
+# Lines starting with # are comments and will be ignored
+
+# Big Tech
+AAPL
+MSFT
+GOOGL
+AMZN
+META
+
+# Other Popular Stocks
+TSLA
+NVDA
+JPM
+UNH
+V
+
+# Add your own tickers below:
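
A note for reviewers on the retry behavior in download_batch_with_retries: each failed attempt sleeps for min(backoff_base * 2**(attempt - 1) + jitter, backoff_max) seconds, with jitter uniform in [0, 0.5]. The following is a minimal sketch of the resulting schedule under the default flags (base 1.5 s, cap 30 s); preview_backoff is a hypothetical helper for illustration, not part of the diff:

import random

def preview_backoff(max_retries: int = 3, base: float = 1.5, cap: float = 30.0) -> list:
    # Hypothetical helper: mirrors the sleep formula in download_batch_with_retries.
    # Doubles per failed attempt, adds up to 0.5s of jitter, and caps at backoff_max.
    return [
        min(base * (2 ** (attempt - 1)) + random.uniform(0, 0.5), cap)
        for attempt in range(1, max_retries + 1)
    ]

print(preview_backoff())  # e.g. [1.63, 3.41, 6.12] seconds before retries 1..3

With the defaults, a batch that fails all four attempts (one initial try plus three retries) spends roughly 11 seconds backing off in total, which stays well under the 30-second cap; the cap only matters for larger --max-retries values.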
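The script can also be reused programmatically rather than through the CLI. This is an illustrative sketch only, assuming fetch_stocks.py is importable from the working directory; the function signatures are taken from the diff above, but the snippet itself is not part of the change:

# Illustrative only: drives the functions from fetch_stocks.py directly.
import fetch_stocks

logger = fetch_stocks.setup_logging()
batch = ["AAPL", "MSFT"]

# One batched request with up to 2 retries, using the same jittered backoff as the CLI.
df = fetch_stocks.download_batch_with_retries(
    tickers=batch,
    period="1mo",
    interval="1d",
    max_retries=2,
    backoff_base=1.5,
    backoff_max=30.0,
    logger=logger,
)
fetch_stocks.save_per_ticker_csv(df, batch, data_dir="data", logger=logger)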