Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions python/acl_anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from .sigs import SIGIndex
from .venues import VenueIndex

CacheDict: TypeAlias = dict[str, str | tuple[int, int]]
NameSpecificationOrIter: TypeAlias = NameSpecification | Iterator[NameSpecification]
PersonOrList: TypeAlias = Person | list[Person]

Expand Down Expand Up @@ -95,6 +96,26 @@ def _check_schema_compatibility(self) -> None:
if datadir_schema != expected_schema:
warnings.warn(SchemaMismatchWarning())

def _compute_cache_dict(self, depends_on: list[str]) -> CacheDict:
"""Compute a dictionary of file stats for caching purposes.

Arguments:
depends_on: A list of files or glob patterns in the Anthology's data directory that the cache depends on.

Returns:
A dictionary containing {'datadir': self.datadir} plus an entry with stats for every file that matches the supplied glob patterns.
"""
cache_dict: CacheDict = {"datadir": str(self.datadir.resolve())}
for pattern in depends_on:
for path in self.datadir.glob(pattern):
if path.is_file():
stat = path.stat()
cache_dict[str(path.relative_to(self.datadir))] = (
stat.st_size,
int(stat.st_mtime),
)
return cache_dict

@classmethod
def from_repo(
cls,
Expand Down
1 change: 0 additions & 1 deletion python/acl_anthology/collections/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,6 @@ def load(self) -> None:
if self.is_data_loaded:
return

log.debug(f"Parsing XML data file: {self.path}")
current_volume = cast(Volume, None) # noqa: F841
for _, element in etree.iterparse(
self.path,
Expand Down
14 changes: 11 additions & 3 deletions python/acl_anthology/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@

from attrs import define
from omegaconf import OmegaConf
from pathlib import Path
from platformdirs import PlatformDirs


dirs = PlatformDirs("acl-anthology")
"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data."""


@define
class DefaultConfig:
url_prefix: str = "${oc.env:ANTHOLOGY_PREFIX,https://aclanthology.org}"
Expand Down Expand Up @@ -51,9 +56,12 @@ class DefaultConfig:
disable_gc: bool = True
"""If True, disables garbage collection while parsing XML files and building indices. This typically results in a considerable speed-up, but if it happens to cause problems, it can be disabled here."""

cache_path: Path = dirs.user_cache_path
"""Path where cache files should be saved/loaded."""

disable_caching: bool = False
"""If True, disables both saving & loading of cache files."""


config = OmegaConf.structured(DefaultConfig)
"""A [structured configuration instance](https://omegaconf.readthedocs.io/en/latest/structured_config.html) that is used by all `acl_anthology` classes."""

dirs = PlatformDirs("acl-anthology")
"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data."""
72 changes: 69 additions & 3 deletions python/acl_anthology/people/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@

from __future__ import annotations

import attrs
from attrs import define, field
from collections.abc import Iterable
from collections import Counter, defaultdict
import itertools as it
import msgpack
from pathlib import Path
from rich.progress import track
from scipy.cluster.hierarchy import DisjointSet # type: ignore
Expand All @@ -30,6 +32,7 @@
except ImportError: # pragma: no cover
from yaml import Loader, Dumper # type: ignore

from ..config import config
from ..containers import SlottedDict
from ..exceptions import (
AnthologyException,
Expand All @@ -44,7 +47,7 @@

if TYPE_CHECKING:
from _typeshed import StrPath
from ..anthology import Anthology
from ..anthology import Anthology, CacheDict
from ..collections import Paper, Volume

log = get_logger()
Expand Down Expand Up @@ -93,6 +96,10 @@ class PersonIndex(SlottedDict[Person]):
def _path(self) -> Path:
    """Path to the people index file (`PEOPLE_INDEX_FILE`) in the Anthology's data directory."""
    return self.parent.datadir / Path(PEOPLE_INDEX_FILE)

@property
def _cache_file(self) -> Path:
    """Location of the serialized PersonIndex cache file."""
    cache_dir = cast(Path, config.cache_path)
    return cache_dir / "PersonIndex.cache"

@property
def by_orcid(self) -> dict[str, str]:
if not self.is_data_loaded:
Expand Down Expand Up @@ -208,10 +215,10 @@ def find_coauthors_counter(

def load(self) -> None:
    """Loads the index from a cache file if possible, otherwise builds it.

    A cached index is only used when caching is enabled in the configuration
    and the cache key still matches the underlying data files.
    """
    if self.is_data_loaded:
        return
    # Try the serialized cache first; fall back to a full rebuild.
    if not config.disable_caching and self._load_cache():
        return
    self.build(show_progress=self.verbose)

def reset(self) -> None:
Expand Down Expand Up @@ -273,6 +280,8 @@ def build(self, show_progress: bool = False) -> None:
"An exception was raised while building PersonIndex; check the logger for details."
) # pragma: no cover
self.is_data_loaded = True
if not config.disable_caching:
self._save_cache()

def _load_people_index(self) -> None:
"""Load and parse the `people.yaml` file.
Expand Down Expand Up @@ -621,3 +630,60 @@ def save(self, path: Optional[StrPath] = None) -> None:

with open(path, "w", encoding="utf-8") as f:
yaml.dump(data, f, allow_unicode=True, Dumper=Dumper)

def _compute_cache_dict(self) -> CacheDict:
"""Compute the cache dictionary for this index.

If the return value is identical between a saved cache file and this instance, the data can be loaded from the cache.
"""
return self.parent._compute_cache_dict(depends_on=["xml/*", "yaml/people.yaml"])

def _save_cache(self) -> None:
    """Serialize the entire PersonIndex to its cache file.

    The first msgpack message written is the cache key (see
    `_compute_cache_dict`); each subsequent message is one serialized Person.
    """
    config.cache_path.mkdir(parents=True, exist_ok=True)

    def _serialize(inst, field, value):
        # NameLink members are stored by their raw value so they round-trip
        # through msgpack; everything else is passed through unchanged.
        if isinstance(value, NameLink):
            return value.value
        return value

    with open(self._cache_file, "wb") as cache_fh:
        # First message: the cache key identifying the data state.
        msgpack.pack(self._compute_cache_dict(), cache_fh)
        # Then one message per Person; empty/falsy fields and the
        # back-reference to the Anthology instance are omitted.
        for person in self.values():
            record = attrs.asdict(
                person,
                filter=lambda a, v: bool(v) and a.name != "parent",
                value_serializer=_serialize,
            )
            msgpack.pack(record, cache_fh)

def _load_cache(self) -> bool:
    """Load the entire PersonIndex from a cache file, if possible.

    Checks if the cache file exists and only loads it if its key is compatible with this Anthology instance (i.e. no files that this cache depends on appear to have changed).

    Returns:
        True if the PersonIndex could be loaded from a cache file.
    """
    if not self._cache_file.exists():
        return False

    try:
        with open(self._cache_file, "rb") as f:
            unpacker = msgpack.Unpacker(f, use_list=False)
            # The first message is the cache key; any mismatch means the
            # underlying data files have changed since the cache was written.
            cache_key = next(unpacker)
            if cache_key != self._compute_cache_dict():
                # Cache invalid
                return False

            # Load from cache
            self.reset()
            log.debug(f"Loading PersonIndex from cache file {self._cache_file}")
            for data in unpacker:
                data["names"] = (
                    (Name.from_dict(x[0]), NameLink(x[1])) for x in data.pop("_names")
                )
                self.add_person(Person(parent=self.parent, **data))
    except Exception:
        # A corrupt or truncated cache file should never crash loading;
        # discard any partially-loaded state and fall back to a full rebuild.
        self.reset()
        log.warning(f"Could not load cache file {self._cache_file}; rebuilding")
        return False

    self.is_data_loaded = True
    return True
3 changes: 2 additions & 1 deletion python/acl_anthology/people/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,15 @@ def slugify(self) -> str:
def from_dict(cls, name: dict[str, str]) -> Name:
    """Create a Name from its dictionary representation.

    Parameters:
        name: A dictionary with "first", "last", and "script" keys. Only "last" is required.

    Returns:
        A corresponding Name object.
    """
    first, script = name.get("first"), name.get("script")
    return cls(first, name["last"], script=script)

@classmethod
Expand Down
6 changes: 4 additions & 2 deletions python/acl_anthology/people/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,17 @@ class Person:
factory=list, converter=_name_list_converter
)
item_ids: list[AnthologyIDTuple] = field(
factory=list, repr=lambda x: f"<list of {len(x)} AnthologyIDTuple objects>"
factory=list,
converter=list,
repr=lambda x: f"<list of {len(x)} AnthologyIDTuple objects>",
)
orcid: Optional[str] = field(
default=None,
on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_person_index),
) # validator defined below
comment: Optional[str] = field(default=None)
degree: Optional[str] = field(default=None)
similar_ids: list[str] = field(factory=list)
similar_ids: list[str] = field(factory=list, converter=list)
disable_name_matching: Optional[bool] = field(default=False, converter=bool)
is_explicit: Optional[bool] = field(default=False, converter=bool)

Expand Down
Loading
Loading