Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions python/acl_anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from .sigs import SIGIndex
from .venues import VenueIndex

CacheDict: TypeAlias = dict[str, str | tuple[int, int]]
NameSpecificationOrIter: TypeAlias = NameSpecification | Iterator[NameSpecification]
PersonOrList: TypeAlias = Person | list[Person]

Expand Down Expand Up @@ -95,6 +96,26 @@ def _check_schema_compatibility(self) -> None:
if datadir_schema != expected_schema:
warnings.warn(SchemaMismatchWarning())

def _compute_cache_dict(self, depends_on: list[str]) -> CacheDict:
"""Compute a dictionary of file stats for caching purposes.

Arguments:
depends_on: A list of files or glob patterns in the Anthology's data directory that the cache depends on.

Returns:
A dictionary containing {'datadir': self.datadir} plus an entry with stats for every file that matches the supplied glob patterns.
"""
cache_dict: CacheDict = {"datadir": str(self.datadir.resolve())}
for pattern in depends_on:
for path in self.datadir.glob(pattern):
if path.is_file():
stat = path.stat()
cache_dict[str(path.relative_to(self.datadir))] = (
stat.st_size,
int(stat.st_mtime),
)
return cache_dict

@classmethod
def from_repo(
cls,
Expand Down
1 change: 0 additions & 1 deletion python/acl_anthology/collections/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,6 @@ def load(self) -> None:
if self.is_data_loaded:
return

log.debug(f"Parsing XML data file: {self.path}")
current_volume = cast(Volume, None) # noqa: F841
for _, element in etree.iterparse(
self.path,
Expand Down
14 changes: 11 additions & 3 deletions python/acl_anthology/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@

from attrs import define
from omegaconf import OmegaConf
from pathlib import Path
from platformdirs import PlatformDirs


dirs = PlatformDirs("acl-anthology")
"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data."""


@define
class DefaultConfig:
url_prefix: str = "${oc.env:ANTHOLOGY_PREFIX,https://aclanthology.org}"
Expand Down Expand Up @@ -51,9 +56,12 @@ class DefaultConfig:
disable_gc: bool = True
"""If True, disables garbage collection while parsing XML files and building indices. This typically results in a considerable speed-up, but if it happens to cause problems, it can be disabled here."""

cache_path: Path = dirs.user_cache_path
"""Path where cache files should be saved/loaded."""

disable_caching: bool = False
"""If True, disables both saving & loading of cache files."""


config = OmegaConf.structured(DefaultConfig)
"""A [structured configuration instance](https://omegaconf.readthedocs.io/en/latest/structured_config.html) that is used by all `acl_anthology` classes."""

dirs = PlatformDirs("acl-anthology")
"""A [PlatformDirs instance](https://platformdirs.readthedocs.io/en/latest/api.html#platformdirs) that returns platform-specific directories for storing data."""
72 changes: 69 additions & 3 deletions python/acl_anthology/people/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@

from __future__ import annotations

import attrs
from attrs import define, field
from collections.abc import Iterable
from collections import Counter, defaultdict
import itertools as it
import msgpack
from pathlib import Path
from rich.progress import track
from scipy.cluster.hierarchy import DisjointSet # type: ignore
Expand All @@ -30,6 +32,7 @@
except ImportError: # pragma: no cover
from yaml import Loader, Dumper # type: ignore

from ..config import config
from ..containers import SlottedDict
from ..exceptions import (
AnthologyException,
Expand All @@ -44,7 +47,7 @@

if TYPE_CHECKING:
from _typeshed import StrPath
from ..anthology import Anthology
from ..anthology import Anthology, CacheDict
from ..collections import Paper, Volume

log = get_logger()
Expand Down Expand Up @@ -93,6 +96,10 @@ class PersonIndex(SlottedDict[Person]):
def _path(self) -> Path:
    """Path to the people index file (`PEOPLE_INDEX_FILE`) in the Anthology's data directory."""
    return self.parent.datadir / Path(PEOPLE_INDEX_FILE)

@property
def _cache_file(self) -> Path:
    """Location of the serialized PersonIndex cache file."""
    cache_dir = cast(Path, config.cache_path)
    return cache_dir / "PersonIndex.cache"

@property
def by_orcid(self) -> dict[str, str]:
if not self.is_data_loaded:
Expand Down Expand Up @@ -208,10 +215,10 @@ def find_coauthors_counter(

def load(self) -> None:
    """Loads the index from a cache file if possible, otherwise builds it.

    A cached index is only used when caching is enabled in the configuration
    and the cache key still matches the underlying data files.
    """
    if self.is_data_loaded:
        return
    # Try the serialized cache first; fall back to a full rebuild.
    if not config.disable_caching and self._load_cache():
        return
    self.build(show_progress=self.verbose)

def reset(self) -> None:
Expand Down Expand Up @@ -273,6 +280,8 @@ def build(self, show_progress: bool = False) -> None:
"An exception was raised while building PersonIndex; check the logger for details."
) # pragma: no cover
self.is_data_loaded = True
if not config.disable_caching:
self._save_cache()

def _load_people_index(self) -> None:
"""Load and parse the `people.yaml` file.
Expand Down Expand Up @@ -621,3 +630,60 @@ def save(self, path: Optional[StrPath] = None) -> None:

with open(path, "w", encoding="utf-8") as f:
yaml.dump(data, f, allow_unicode=True, Dumper=Dumper)

def _compute_cache_dict(self) -> CacheDict:
"""Compute the cache dictionary for this index.

If the return value is identical between a saved cache file and this instance, the data can be loaded from the cache.
"""
return self.parent._compute_cache_dict(depends_on=["xml/*", "yaml/people.yaml"])

def _save_cache(self) -> None:
    """Serialize the entire PersonIndex to its cache file.

    The first msgpack message written is the cache key (see
    `_compute_cache_dict`); each subsequent message is one serialized Person.
    """
    config.cache_path.mkdir(parents=True, exist_ok=True)

    def _serialize(inst, field, value):
        # NameLink members are stored by their raw value so they round-trip
        # through msgpack; everything else is passed through unchanged.
        if isinstance(value, NameLink):
            return value.value
        return value

    with open(self._cache_file, "wb") as cache_fh:
        # First message: the cache key identifying the data state.
        msgpack.pack(self._compute_cache_dict(), cache_fh)
        # Then one message per Person; empty/falsy fields and the
        # back-reference to the Anthology instance are omitted.
        for person in self.values():
            record = attrs.asdict(
                person,
                filter=lambda a, v: bool(v) and a.name != "parent",
                value_serializer=_serialize,
            )
            msgpack.pack(record, cache_fh)

def _load_cache(self) -> bool:
    """Load the entire PersonIndex from a cache file, if possible.

    Checks if the cache file exists and only loads it if its key is compatible with this Anthology instance (i.e. no files that this cache depends on appear to have changed).

    Returns:
        True if the PersonIndex could be loaded from a cache file.
    """
    if not self._cache_file.exists():
        return False

    try:
        with open(self._cache_file, "rb") as f:
            unpacker = msgpack.Unpacker(f, use_list=False)
            # The first message is the cache key; any mismatch means the
            # underlying data files have changed since the cache was written.
            cache_key = next(unpacker)
            if cache_key != self._compute_cache_dict():
                # Cache invalid
                return False

            # Load from cache
            self.reset()
            log.debug(f"Loading PersonIndex from cache file {self._cache_file}")
            for data in unpacker:
                data["names"] = (
                    (Name.from_dict(x[0]), NameLink(x[1])) for x in data.pop("_names")
                )
                self.add_person(Person(parent=self.parent, **data))
    except Exception:
        # A corrupt or truncated cache file should never crash loading;
        # discard any partially-loaded state and fall back to a full rebuild.
        self.reset()
        log.warning(f"Could not load cache file {self._cache_file}; rebuilding")
        return False

    self.is_data_loaded = True
    return True
3 changes: 2 additions & 1 deletion python/acl_anthology/people/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,15 @@ def slugify(self) -> str:
def from_dict(cls, name: dict[str, str]) -> Name:
    """Create a Name from its dictionary representation.

    Parameters:
        name: A dictionary with "first", "last", and "script" keys. Only "last" is required.

    Returns:
        A corresponding Name object.
    """
    first, script = name.get("first"), name.get("script")
    return cls(first, name["last"], script=script)

@classmethod
Expand Down
6 changes: 4 additions & 2 deletions python/acl_anthology/people/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,17 @@ class Person:
factory=list, converter=_name_list_converter
)
item_ids: list[AnthologyIDTuple] = field(
factory=list, repr=lambda x: f"<list of {len(x)} AnthologyIDTuple objects>"
factory=list,
converter=list,
repr=lambda x: f"<list of {len(x)} AnthologyIDTuple objects>",
)
orcid: Optional[str] = field(
default=None,
on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_person_index),
) # validator defined below
comment: Optional[str] = field(default=None)
degree: Optional[str] = field(default=None)
similar_ids: list[str] = field(factory=list)
similar_ids: list[str] = field(factory=list, converter=list)
disable_name_matching: Optional[bool] = field(default=False, converter=bool)
is_explicit: Optional[bool] = field(default=False, converter=bool)

Expand Down
Loading
Loading