Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 20 additions & 11 deletions src/index_503/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from operator import attrgetter
from pathlib import Path
from shutil import rmtree
Expand Down Expand Up @@ -90,11 +91,10 @@ def _atomic_replace_old_index(self, temp_dir_path: Path, target_path: Path) -> N

def _make_index_at_temp_dir(self, temp_dir_path: Path) -> None:
"""Generate a simple repository of Python wheels in a temp dir."""
new_wheel_file_objects: list[WheelFile] = []
projects: dict[str, list[WheelFile]] = defaultdict(list)
wheel_file_name_to_metadata_path: dict[str, Path] = {}
all_wheel_files: set[str] = set()
raw_cache = self.cache.cache
misses: list[tuple[Path, Path, Path]] = []

for wheel_file in glob.glob(str(self.origin_path.joinpath("*.whl"))):
wheel_path = Path(wheel_file)
Expand All @@ -108,16 +108,25 @@ def _make_index_at_temp_dir(self, temp_dir_path: Path) -> None:
wheel_file_obj := WheelFile.from_cache(wheel_cache, mtime, size)
):
os.link(self.target_path.joinpath(metadata_path.name), metadata_path)
elif wheel_file_obj := WheelFile.from_wheel(wheel_path, metadata_path):
wheel_file_name_to_metadata_path[wheel_file_name] = metadata_path
new_wheel_file_objects.append(wheel_file_obj)
raw_cache[wheel_file_name] = wheel_file_obj.as_dict()
projects[wheel_file_obj.canonical_name].append(wheel_file_obj)
os.link(wheel_path, target_file)
else:
continue

canonical_name = wheel_file_obj.canonical_name
projects[canonical_name].append(wheel_file_obj)
os.link(wheel_path, target_file)
misses.append((wheel_path, target_file, metadata_path))

if misses:
# from_wheel is I/O- and hash-bound (both release the GIL); each
# task writes only to its own metadata_path, so threads are safe.
with ThreadPoolExecutor() as executor:
results = executor.map(
lambda args: (args, WheelFile.from_wheel(args[0], args[2])),
misses,
)
for (wheel_path, target_file, _), wheel_file_obj in results:
if wheel_file_obj is None:
continue
raw_cache[wheel_path.name] = wheel_file_obj.as_dict()
projects[wheel_file_obj.canonical_name].append(wheel_file_obj)
os.link(wheel_path, target_file)

self.cache.remove_stale_keys(all_wheel_files)
self.generate_index_pages(temp_dir_path, projects)
Expand Down
15 changes: 12 additions & 3 deletions src/index_503/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,20 @@ def get_mtime_and_size_from_path(path: Path) -> tuple[float, int]:
return stat.st_mtime, stat.st_size


_HASH_CHUNK_SIZE = 1024 * 1024  # bytes read per iteration (1 MiB)


def get_sha256_hash(filename: Path) -> str:
    """Get SHA256 hash of a file.

    Streams the file in chunks to avoid loading large wheels entirely
    into memory.

    Args:
        filename: Path of the file to hash.

    Returns:
        The hex-encoded SHA256 digest of the file's contents.
    """
    hasher = sha256()
    with filename.open("rb") as f:
        # read() returns b"" at end of file, which is falsy and ends the loop.
        while chunk := f.read(_HASH_CHUNK_SIZE):
            hasher.update(chunk)
    return hasher.hexdigest()


def load_json_file(filename: Path) -> dict[str, dict[str, Any]]:
Expand Down