diff --git a/src/index_503/index.py b/src/index_503/index.py
index 919dd0f..fabad92 100644
--- a/src/index_503/index.py
+++ b/src/index_503/index.py
@@ -2,6 +2,7 @@
 import logging
 import os
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from operator import attrgetter
 from pathlib import Path
 from shutil import rmtree
@@ -90,11 +91,10 @@ def _atomic_replace_old_index(self, temp_dir_path: Path, target_path: Path) -> N
 
     def _make_index_at_temp_dir(self, temp_dir_path: Path) -> None:
         """Generate a simple repository of Python wheels in a temp dir."""
-        new_wheel_file_objects: list[WheelFile] = []
         projects: dict[str, list[WheelFile]] = defaultdict(list)
-        wheel_file_name_to_metadata_path: dict[str, Path] = {}
         all_wheel_files: set[str] = set()
         raw_cache = self.cache.cache
+        misses: list[tuple[Path, Path, Path]] = []
 
         for wheel_file in glob.glob(str(self.origin_path.joinpath("*.whl"))):
             wheel_path = Path(wheel_file)
@@ -108,16 +108,25 @@ def _make_index_at_temp_dir(self, temp_dir_path: Path) -> None:
                 wheel_file_obj := WheelFile.from_cache(wheel_cache, mtime, size)
             ):
                 os.link(self.target_path.joinpath(metadata_path.name), metadata_path)
-            elif wheel_file_obj := WheelFile.from_wheel(wheel_path, metadata_path):
-                wheel_file_name_to_metadata_path[wheel_file_name] = metadata_path
-                new_wheel_file_objects.append(wheel_file_obj)
-                raw_cache[wheel_file_name] = wheel_file_obj.as_dict()
+                projects[wheel_file_obj.canonical_name].append(wheel_file_obj)
+                os.link(wheel_path, target_file)
             else:
-                continue
-
-            canonical_name = wheel_file_obj.canonical_name
-            projects[canonical_name].append(wheel_file_obj)
-            os.link(wheel_path, target_file)
+                misses.append((wheel_path, target_file, metadata_path))
+
+        if misses:
+            # from_wheel is I/O- and hash-bound (both release the GIL); each
+            # task writes only to its own metadata_path, so threads are safe.
+            with ThreadPoolExecutor() as executor:
+                results = executor.map(
+                    lambda args: (args, WheelFile.from_wheel(args[0], args[2])),
+                    misses,
+                )
+                for (wheel_path, target_file, _), wheel_file_obj in results:
+                    if wheel_file_obj is None:
+                        continue
+                    raw_cache[wheel_path.name] = wheel_file_obj.as_dict()
+                    projects[wheel_file_obj.canonical_name].append(wheel_file_obj)
+                    os.link(wheel_path, target_file)
 
         self.cache.remove_stale_keys(all_wheel_files)
         self.generate_index_pages(temp_dir_path, projects)
diff --git a/src/index_503/util.py b/src/index_503/util.py
index ceb0a53..93eb74c 100644
--- a/src/index_503/util.py
+++ b/src/index_503/util.py
@@ -26,11 +26,20 @@ def get_mtime_and_size_from_path(path: Path) -> tuple[float, int]:
     return stat.st_mtime, stat.st_size
 
 
+_HASH_CHUNK_SIZE = 1024 * 1024
+
+
 def get_sha256_hash(filename: Path) -> str:
-    """Get SHA256 hash of a file."""
+    """Get SHA256 hash of a file.
+
+    Streams the file in chunks to avoid loading large wheels entirely
+    into memory.
+    """
+    hasher = sha256()
     with filename.open("rb") as f:
-        bytes = f.read()  # read entire file as bytes
-    return sha256(bytes).hexdigest()
+        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest()
 
 
 def load_json_file(filename: Path) -> dict[str, dict[str, Any]]: