Merged
53 commits
9bbd5fa
Added scratch implementation of tdt stubbing.
pauladkisson Mar 11, 2026
751699d
Added stub function to tdt_recording_extractor.py.
pauladkisson Mar 11, 2026
7a89a10
Added tests
pauladkisson Mar 11, 2026
95a6199
updated tests to avoid tempfile
pauladkisson Mar 11, 2026
3386ee2
Refactored to use a time instead of segments.
pauladkisson Mar 11, 2026
08b7ea7
updated tests
pauladkisson Mar 11, 2026
ed42842
Add in support for TTLs.
pauladkisson Mar 11, 2026
a2be75e
Add test for stubbed ttls
pauladkisson Mar 11, 2026
9d55579
renamed params
pauladkisson Mar 11, 2026
442950f
renamed params
pauladkisson Mar 11, 2026
f8a4a21
Stub-csv
pauladkisson Mar 11, 2026
1b0b1ba
Generalized stubbing logic.
pauladkisson Mar 11, 2026
afc5f73
Generalized stubbing logic.
pauladkisson Mar 11, 2026
9c6eb52
Refactored read and save to match the rest of the recording extr…
pauladkisson Mar 11, 2026
80d91a7
Fixed Mock Recording Extractor.
pauladkisson Mar 11, 2026
ab01050
Fixed NPM race condition
pauladkisson Mar 11, 2026
2bea0f7
Added stub function to Doric.
pauladkisson Mar 11, 2026
a010dce
Added stub function to Doric.
pauladkisson Mar 11, 2026
bbeb9ba
removed scratch
pauladkisson Mar 11, 2026
2b525a4
Added stubbing script.
pauladkisson Mar 11, 2026
16af9fe
updated npm stubbing
pauladkisson Mar 11, 2026
3afc899
updated npm stubbing
pauladkisson Mar 12, 2026
518f444
updated stubbing script
pauladkisson Mar 12, 2026
911c566
fixed doric stubbing bug
pauladkisson Mar 12, 2026
95a6a2f
Added stub summarization script.
pauladkisson Mar 12, 2026
726857b
Updated stub to use 5 TTLs.
pauladkisson Mar 12, 2026
64f11cd
Remove to TQDM
pauladkisson Mar 12, 2026
53e2b38
added stubbed_testing_data to lfs
pauladkisson Mar 12, 2026
7cee438
Exclude stubbed testing data from pre-commit hooks.
pauladkisson Mar 12, 2026
80a6741
Added stubbed testing data
pauladkisson Mar 12, 2026
57793e4
Switched unit and integration tests to reference stub data instead of…
pauladkisson Mar 12, 2026
a0b9fc4
Renamed pytest mark to better convey its purpose.
pauladkisson Mar 12, 2026
71b3c94
Centralized testing data folder variables.
pauladkisson Mar 12, 2026
30d3ca6
Updated artifact removal code to work with stubbed data.
pauladkisson Mar 12, 2026
9f8870e
Updated stubbing script and data with a new minimum duration of 16 se…
pauladkisson Mar 12, 2026
ddab0fe
Updated Summarize Stub Script to avoid side effects.
pauladkisson Mar 12, 2026
18297f5
Updated intra-session mixed modality integration test to space the CS…
pauladkisson Mar 12, 2026
1a3342c
Updated NPM intermediate CSV file assertions for stubbed data.
pauladkisson Mar 12, 2026
3bcfd80
Updated NPM stubbing to skip problematic but small sessions
pauladkisson Mar 13, 2026
3a8df0d
updated integration test
pauladkisson Mar 13, 2026
c40cbfc
fixed stub test
pauladkisson Mar 13, 2026
aaa2311
with lfs
pauladkisson Mar 13, 2026
0500865
all close for stub_data_matches_original
pauladkisson Mar 13, 2026
ab8c0c5
Use only first and last Python versions for PR tests.
pauladkisson Mar 13, 2026
b341c6c
Move stubbing scripts to source guppy testing.
pauladkisson Mar 13, 2026
9e25b6f
Added README for stubbed testing data.
pauladkisson Mar 13, 2026
eb3450f
updated readme
pauladkisson Mar 13, 2026
82e8814
Updated changelog
pauladkisson Mar 25, 2026
2b03a00
Avoid replacing open files.
pauladkisson Mar 25, 2026
417ce56
Added timeout to CI tests.
pauladkisson Mar 25, 2026
182ee4b
Updated tests to support separation of parallel and non-parallel tests.
pauladkisson Mar 25, 2026
b2535ba
Clarified parallel situation
pauladkisson Mar 25, 2026
d3558ec
Merge pull request #244 from LernerLab/multiprocessing
pauladkisson Mar 25, 2026
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Auto detect text files and perform LF normalization
* text=auto
stubbed_testing_data/** filter=lfs diff=lfs merge=lfs -text
1 change: 1 addition & 0 deletions .github/workflows/first_and_last_python_versions.txt
@@ -0,0 +1 @@
["3.10", "3.13"]
4 changes: 2 additions & 2 deletions .github/workflows/pr-tests.yml
@@ -21,7 +21,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- id: load_python_versions
run: echo "python_versions=$(cat ./.github/workflows/all_python_versions.txt)" >> "$GITHUB_OUTPUT"
run: echo "python_versions=$(cat ./.github/workflows/first_and_last_python_versions.txt)" >> "$GITHUB_OUTPUT"
- id: load_os_versions
run: echo "os_versions=$(cat ./.github/workflows/all_os_versions.txt)" >> "$GITHUB_OUTPUT"
- name: Debugging
@@ -49,11 +49,11 @@ jobs:
needs: [load_python_and_os_versions]
uses: ./.github/workflows/run-tests.yml
secrets:
RCLONE_CONFIG: ${{ secrets.RCLONE_CONFIG }}
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with: # Ternary operator: condition && value_if_true || value_if_false
python-versions: ${{ github.event.pull_request.draft == true && '["3.10"]' || needs.load_python_and_os_versions.outputs.ALL_PYTHON_VERSIONS }}
os-versions: ${{ github.event.pull_request.draft == true && '["ubuntu-latest"]' || needs.load_python_and_os_versions.outputs.ALL_OS_VERSIONS }}
skip-full-data-tests: true

check-final-status:
name: All tests passing
20 changes: 18 additions & 2 deletions .github/workflows/run-tests.yml
@@ -11,9 +11,14 @@ on:
description: 'List of OS versions to use in matrix, as JSON string'
required: true
type: string
skip-full-data-tests:
description: 'Whether to skip tests marked full_data'
required: false
type: boolean
default: false
secrets:
RCLONE_CONFIG:
required: true
required: false
CODECOV_TOKEN:
required: true
workflow_dispatch:
@@ -28,18 +33,26 @@ on:
required: true
type: string
default: '["ubuntu-latest", "windows-2022", "macos-latest"]'
skip-full-data-tests:
description: 'Whether to skip tests marked full_data'
required: false
type: boolean
default: false

jobs:
run:
name: ${{ matrix.os }} Python ${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
python-version: ${{ fromJson(inputs.python-versions) }}
os: ${{ fromJson(inputs.os-versions) }}
steps:
- uses: actions/checkout@v5
with:
lfs: true
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
@@ -54,12 +67,15 @@ jobs:
python -m pip install --group test

- name: Prepare data for tests
if: ${{ !inputs.skip-full-data-tests }}
uses: ./.github/actions/load-data
with:
rclone-config: ${{ secrets.RCLONE_CONFIG }}

- name: Run tests
run: pytest tests -vv -rsx --dist loadscope --cov=guppy --cov-branch --cov-report=xml # TODO: add -n auto when tests no longer use multiprocessing
run: pytest tests -vv -rsx -n auto --dist worksteal -m "not parallel${{ inputs.skip-full-data-tests && ' and not full_data' || '' }}" --cov=guppy --cov-branch --cov-report=xml
- name: Run parallel tests
run: pytest tests -vv -rsx -m "parallel and not full_data"
- name: Upload full coverage to Codecov
if: ${{ matrix.python-version == '3.10' && matrix.os == 'ubuntu-latest' }}
uses: codecov/codecov-action@v5
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
@@ -3,24 +3,29 @@ repos:
rev: v6.0.0
hooks:
- id: check-yaml
exclude: ^stubbed_testing_data/
- id: end-of-file-fixer
exclude: ^stubbed_testing_data/
- id: trailing-whitespace
exclude: ^stubbed_testing_data/

- repo: https://github.com/psf/black
rev: 25.1.0
hooks:
- id: black
exclude: ^docs/
exclude: ^(docs|stubbed_testing_data)/

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.0
hooks:
- id: ruff
args: [ --fix ]
exclude: ^stubbed_testing_data/

- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
exclude: ^stubbed_testing_data/
additional_dependencies:
- tomli
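The `exclude:` values in this file are Python regular expressions; pre-commit matches them against each repository-relative file path, so the leading `^` pins them to the path prefix. A quick sketch of what the combined black pattern selects (the file paths here are made up for illustration):

```python
# Sketch of pre-commit's exclude matching: each exclude value is a Python
# regex applied to the repo-relative path. The example paths are hypothetical.
import re

pattern = re.compile(r"^(docs|stubbed_testing_data)/")

print(bool(pattern.search("stubbed_testing_data/session1/data.csv")))  # True
print(bool(pattern.search("docs/index.md")))                           # True
print(bool(pattern.search("src/guppy/extractors/csv_recording_extractor.py")))  # False
```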
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -30,6 +30,7 @@
- Restored automatic data modality detection and mixed-modality TTL/signal support with a modular, separation-of-concerns architecture: [PR #226](https://github.com/LernerLab/GuPPy/pull/226)
- Expanded test suite with consistency tests that compare results to GuPPy-v1.3.0: [PR #207](https://github.com/LernerLab/GuPPy/pull/207)
- Expanded test suite with unit tests for recording extractor classes: [PR #240](https://github.com/LernerLab/GuPPy/pull/240)
- Migrated testing datasets from Google Drive to GitHub LFS with comprehensive documentation and CI/CD integration: [PR #242](https://github.com/LernerLab/GuPPy/pull/242)

# GuPPy-v1.3.0 (August 12th, 2025)

6 changes: 6 additions & 0 deletions pyproject.toml
@@ -131,3 +131,9 @@ ignore-words-list = 'assertin,sortings'
concurrency = ["multiprocessing"]
sigterm = true
branch = true

[tool.pytest.ini_options]
markers = [
"full_data: marks tests that require the full testing_data download (deselect with '-m not full_data')",
"parallel: marks tests that verify GuPPy multiprocessing behavior (run separately from the main suite)",
]
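The CI workflow selects tests with `-m` expressions built from these markers, e.g. `-m "not parallel and not full_data"` for the main suite and `-m "parallel and not full_data"` for the separate parallel run. The sketch below is not pytest's implementation, just an illustration of how such a boolean marker expression partitions a suite; the test names and marker sets are hypothetical:

```python
# Illustrative sketch (not pytest internals) of how a "-m" marker expression
# partitions a test suite. Test names and marker assignments are hypothetical.

def select(tests, expression):
    """Return names of tests whose marker set satisfies the -m style expression.

    `tests` maps test name -> set of marker names; the expression is evaluated
    with each known marker name bound to True/False for that test.
    """
    selected = []
    for name, marks in tests.items():
        namespace = {m: (m in marks) for m in ("parallel", "full_data")}
        if eval(expression, {"__builtins__": {}}, namespace):
            selected.append(name)
    return selected

tests = {
    "test_stub_matches_original": set(),
    "test_full_pipeline": {"full_data"},
    "test_multiprocessing_run": {"parallel"},
}

print(select(tests, "not parallel and not full_data"))  # ['test_stub_matches_original']
print(select(tests, "parallel and not full_data"))      # ['test_multiprocessing_run']
```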
19 changes: 19 additions & 0 deletions src/guppy/extractors/base_recording_extractor.py
@@ -76,6 +76,25 @@ def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None:
"""
pass

@abstractmethod
def stub(self, *, folder_path, duration_in_seconds=1.0):
"""
Create a stubbed copy of the data folder truncated to a short duration.

Copies the source folder to `folder_path`, then truncates data files so
that only the first `duration_in_seconds` of recorded data are retained.
If `folder_path` already exists it is overwritten.

Parameters
----------
folder_path : str or Path
Destination directory for the stubbed data. Created if it does not
exist; overwritten if it already exists.
duration_in_seconds : float, optional
Approximate duration of data to retain in seconds. Default is 1.0.
"""
pass

@staticmethod
def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None:
"""
76 changes: 66 additions & 10 deletions src/guppy/extractors/csv_recording_extractor.py
@@ -1,7 +1,8 @@
import copy
import glob
import logging
import os
import shutil
from pathlib import Path
from typing import Any

import numpy as np
@@ -184,15 +185,70 @@ def _save_to_hdf5(self, df, event, outputPath):
def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]:
output_dicts = []
for event in events:
df = self._read_csv(event=event)
S = df.to_dict()
S["storename"] = event
output_dicts.append(S)
dataframe = self._read_csv(event=event)
columns_lowercase = [col.lower() for col in dataframe.columns]
if "data" in columns_lowercase:
output_dicts.append(
{
"storename": event,
"timestamps": dataframe["timestamps"].dropna().to_numpy(),
"data": dataframe["data"].dropna().to_numpy(),
"sampling_rate": dataframe["sampling_rate"].dropna().to_numpy()[:1],
}
)
else:
output_dicts.append(
{
"storename": event,
"timestamps": dataframe["timestamps"].dropna().to_numpy(),
}
)
return output_dicts

def stub(self, *, folder_path, duration_in_seconds=1.0):
"""
Create a stubbed copy of the CSV folder with truncated data files.

Copies the entire folder to `folder_path`, then replaces each CSV file
with a version truncated to `duration_in_seconds`. The cutoff timestamp
is computed as the first timestamp in the first data CSV plus
`duration_in_seconds`. Both data CSVs (3-column) and event CSVs
(1-column) are filtered to rows at or before the cutoff.

Parameters
----------
folder_path : str or Path
Destination directory for the stubbed folder. Created if it does
not exist; overwritten if it already exists.
duration_in_seconds : float, optional
Approximate duration of data to retain in seconds. Default is 1.0.
"""
folder_path = Path(folder_path)
if folder_path.exists():
shutil.rmtree(folder_path)
shutil.copytree(self.folder_path, folder_path)

event_names, flags = CsvRecordingExtractor.discover_events_and_flags(self.folder_path)

first_data_timestamp = None
for event_name, flag in zip(event_names, flags):
if flag == "data_csv":
dataframe = pd.read_csv(Path(self.folder_path) / f"{event_name}.csv", index_col=False)
first_data_timestamp = dataframe["timestamps"].iloc[0]
break

cutoff_timestamp = first_data_timestamp + duration_in_seconds

for event_name, flag in zip(event_names, flags):
csv_path = folder_path / f"{event_name}.csv"
dataframe = pd.read_csv(csv_path, index_col=False)
dataframe = dataframe[dataframe["timestamps"] <= cutoff_timestamp]
dataframe.to_csv(csv_path, index=False)

def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None:
for S in output_dicts:
working_dict = copy.deepcopy(S)
event = working_dict.pop("storename")
df = pd.DataFrame.from_dict(working_dict)
self._save_to_hdf5(df=df, event=event, outputPath=outputPath)
for output_dict in output_dicts:
storename = output_dict["storename"]
for key, value in output_dict.items():
if key == "storename":
continue
self._write_hdf5(value, storename, outputPath, key)
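The truncation step shared by the stubbing implementations keeps everything within `duration_in_seconds` of the first timestamp. On sorted timestamps, the boolean mask used for CSV rows and the `np.searchsorted(..., side="right")` index used for the HDF5 datasets select the same prefix (toy data below):

```python
# The two truncation idioms in this PR agree on sorted timestamps:
# a boolean mask (CSV rows) and searchsorted with side="right" (HDF5 datasets).
import numpy as np

timestamps = np.arange(0.0, 5.0, 0.25)  # 20 toy samples at 4 Hz
duration_in_seconds = 1.0
cutoff = timestamps[0] + duration_in_seconds

mask_kept = timestamps[timestamps <= cutoff]
index = int(np.searchsorted(timestamps, cutoff, side="right"))
index_kept = timestamps[:index]

print(len(index_kept))  # 5 samples: 0.0, 0.25, 0.5, 0.75, 1.0
```

`side="right"` makes the index cut inclusive of a sample that lands exactly on the cutoff, matching the `<=` mask.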
117 changes: 117 additions & 0 deletions src/guppy/extractors/doric_recording_extractor.py
@@ -2,7 +2,9 @@
import logging
import os
import re
import shutil
import warnings
from pathlib import Path
from typing import Any

import h5py
@@ -293,6 +295,121 @@ def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]:

return output_dicts

def stub(self, *, folder_path, duration_in_seconds=1.0):
"""
Create a stubbed copy of the Doric folder truncated to a short duration.

Copies the source folder to `folder_path`, then rewrites the Doric data
file so that only the first `duration_in_seconds` of recorded data are
retained. Supports V1 (.doric HDF5) and CSV (doric_csv) formats. V6
HDF5 format is not yet supported.

Parameters
----------
folder_path : str or Path
Destination directory for the stubbed folder. Created if it does
not exist; overwritten if it already exists.
duration_in_seconds : float, optional
Approximate duration of data to retain in seconds. Default is 1.0.
"""
folder_path = Path(folder_path)
if folder_path.exists():
shutil.rmtree(folder_path)
shutil.copytree(self.folder_path, folder_path)

flag = self._check_doric()
if flag == "doric_doric":
self._stub_doric_hdf5(folder_path=folder_path, duration_in_seconds=duration_in_seconds)
elif flag == "doric_csv":
self._stub_doric_csv(folder_path=folder_path, duration_in_seconds=duration_in_seconds)

def _stub_doric_hdf5(self, *, folder_path, duration_in_seconds):
doric_paths = glob.glob(os.path.join(folder_path, "*.doric"))
doric_path = doric_paths[0]

with h5py.File(doric_path, "r") as source_file:
if "Traces" in list(source_file.keys()):
temporary_path = self._stub_doric_hdf5_v1(
source_file=source_file, doric_path=doric_path, duration_in_seconds=duration_in_seconds
)
else:
temporary_path = self._stub_doric_hdf5_v6(
source_file=source_file, doric_path=doric_path, duration_in_seconds=duration_in_seconds
)

# Replace after closing source_file so Windows does not raise PermissionError on open files.
os.replace(temporary_path, doric_path)

def _stub_doric_hdf5_v1(self, *, source_file, doric_path, duration_in_seconds):
timestamps = np.array(source_file["Traces"]["Console"]["Time(s)"]["Console_time(s)"])
cutoff_timestamp = timestamps[0] + duration_in_seconds
cutoff_index = int(np.searchsorted(timestamps, cutoff_timestamp, side="right"))

# Only keep channels needed by this extractor instance, not all channels in the file.
# This avoids copying irrelevant channels that inflate stub file size.
channel_keys = list(self._event_name_to_event_type.keys())

channel_data = {}
for key in channel_keys:
channel_data[key] = np.array(source_file["Traces"]["Console"][key][key])

temporary_path = doric_path + ".tmp"
with h5py.File(temporary_path, "w") as destination_file:
console = destination_file.require_group("Traces/Console")
time_group = console.require_group("Time(s)")
time_group.create_dataset("Console_time(s)", data=timestamps[:cutoff_index], compression="gzip")
for key in channel_keys:
channel_group = console.require_group(key)
channel_group.create_dataset(key, data=channel_data[key][:cutoff_index], compression="gzip")

return temporary_path

def _stub_doric_hdf5_v6(self, *, source_file, doric_path, duration_in_seconds):
temporary_path = doric_path + ".tmp"
with h5py.File(temporary_path, "w") as destination_file:
if "Configurations" in source_file:
source_file.copy("Configurations", destination_file)
self._copy_group_truncated(
source_group=source_file["DataAcquisition"],
destination_group=destination_file.require_group("DataAcquisition"),
duration_in_seconds=duration_in_seconds,
)
return temporary_path

def _copy_group_truncated(self, *, source_group, destination_group, duration_in_seconds):
if "Time" in source_group:
time_data = source_group["Time"][:]
cutoff_index = int(np.searchsorted(time_data, time_data[0] + duration_in_seconds, side="right"))
for key in source_group:
destination_group.create_dataset(key, data=source_group[key][:cutoff_index])
else:
for key in source_group:
item = source_group[key]
if isinstance(item, h5py.Group):
self._copy_group_truncated(
source_group=item,
destination_group=destination_group.require_group(key),
duration_in_seconds=duration_in_seconds,
)
else:
destination_group.create_dataset(key, data=item[:])

def _stub_doric_csv(self, *, folder_path, duration_in_seconds):
csv_paths = glob.glob(os.path.join(folder_path, "*.csv"))
csv_path = csv_paths[0]

# Row 0 is the channel descriptor row; row 1 is the column name row (header=1 skips both)
header_rows = pd.read_csv(csv_path, header=None, nrows=2, index_col=False, dtype=str)
dataframe = pd.read_csv(csv_path, header=1, index_col=False)
dataframe = dataframe.dropna(axis=1, how="all")

cutoff_timestamp = dataframe["Time(s)"].iloc[0] + duration_in_seconds
dataframe = dataframe[dataframe["Time(s)"] <= cutoff_timestamp]

with open(csv_path, "w") as file:
header_rows.to_csv(file, index=False, header=False)
dataframe.to_csv(file, index=False, header=False)

def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None:
for S in output_dicts:
storename = S["storename"]
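The recursion in `_copy_group_truncated` can be sketched without h5py by modeling groups as nested dicts and datasets as lists (the group/channel names below are hypothetical). A group containing a `Time` dataset has all sibling datasets cut at the same index, keeping channels aligned with their timestamps; groups without `Time` are recursed into or copied unchanged:

```python
# Stdlib analogue of _copy_group_truncated: nested dicts stand in for h5py
# groups, lists for datasets. Group and channel names are hypothetical.
import bisect

def copy_group_truncated(source, duration_in_seconds):
    if "Time" in source:
        time = source["Time"]
        # bisect_right mirrors np.searchsorted(..., side="right")
        cutoff_index = bisect.bisect_right(time, time[0] + duration_in_seconds)
        return {key: value[:cutoff_index] for key, value in source.items()}
    result = {}
    for key, value in source.items():
        if isinstance(value, dict):  # subgroup: recurse
            result[key] = copy_group_truncated(value, duration_in_seconds)
        else:                        # dataset with no sibling Time: copy as-is
            result[key] = list(value)
    return result

source = {
    "SignalIn": {"Time": [0.0, 0.5, 1.0, 1.5, 2.0], "AIN01": [5, 6, 7, 8, 9]},
    "Meta": {"Label": ["ch0"]},
}
stub = copy_group_truncated(source, duration_in_seconds=1.0)
print(stub["SignalIn"]["Time"])   # [0.0, 0.5, 1.0]
print(stub["SignalIn"]["AIN01"])  # [5, 6, 7]
```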