Parquet reading (#20)

StevenGranados · web-flow · commit da2091305466 · 2026-04-27T10:40:47.000+01:00
* Initial

* Next AI commit

* Read from domain names

* Remove load_channels

* Update python test versions

* Just 3.14

* Copilot feedback

* Incorrect parameter name

* Use gather and make more Pythonic
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -10,24 +10,21 @@ pool:
   vmImage: 'ubuntu-latest'
 
 variables:
-  python.buildVersion: '3.8'
-
-  major: 8
+  python.buildVersion: '3.10'
+  isPullRequest: ${{ eq(variables['Build.Reason'], 'PullRequest') }}
+  major: 9
   minor: $[counter(variables.major, 0)]
-  buildVersion: $[format('{0}.{1}', variables.major, variables.minor)]
-
-  isPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]
+  ${{ if eq(variables['isPullRequest'], 'True') }}:
+    buildVersion: $[format('{0}.{1}.dev1', variables.major, variables.minor)]
+  ${{ if eq(variables['isPullRequest'], 'False') }}:
+    buildVersion: $[format('{0}.{1}', variables.major, variables.minor)]
 
 name: '$(buildVersion)'
 
 jobs:
   - job: Test
     strategy:
       matrix:
-        Python38:
-          python.version: '3.8'
-        Python39:
-          python.version: '3.9'
         Python310:
           python.version: '3.10'
         Python311:
@@ -36,6 +33,8 @@ jobs:
           python.version: '3.12'
         Python313:
           python.version: '3.13'
+        Python314:
+          python.version: '3.14'
 
     steps:
     - task: UsePythonVersion@0
@@ -65,7 +64,6 @@ jobs:
 
   - job: Build
     dependsOn: Test
-    condition: and(succeeded(), eq(variables['isPullRequest'], 'False'))
 
     steps:
     - task: UsePythonVersion@0
diff --git a/canopy/__init__.py b/canopy/__init__.py
@@ -42,7 +42,7 @@
 from canopy.update_config import update_config
 from canopy.load_study import load_study
 from canopy.load_study_job import load_study_job
-from canopy.load_channel import load_channel
+from canopy.load_channels import load_channels
 from canopy.load_vector_metadata import load_vector_metadata
 from canopy.get_study_document import get_study_document
 from canopy.job_count_to_simulation_count import job_count_to_simulation_count
diff --git a/canopy/load_channel.py b/canopy/load_channel.py
diff --git a/canopy/load_channels.py b/canopy/load_channels.py
@@ -0,0 +1,122 @@
+import polars as pl
+from typing import List, Optional
+import numpy as np
+import pandas as pd
+import canopy
+import logging
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+
+async def load_channels(
+        session: canopy.Session,
+        job_access_information: canopy.openapi.BlobAccessInformation,
+        sim_type: str,
+        channel_names: List[str],
+        vector_metadata: Optional[pd.DataFrame] = None,
+        semaphore: Optional[asyncio.Semaphore] = None) -> List[Optional[canopy.LoadedChannel]]:
+    """Load several channels of one simulation type for a job.
+
+    Channels are first requested from the per-x-domain parquet files
+    (``{sim_type}_{x_domain}_VectorResults.parquet``). Every channel that was
+    not obtained that way is then loaded from its own
+    ``{sim_type}_{channel_name}.bin`` blob.
+
+    Args:
+        session: Used to download the ``.bin`` blobs and to supply the default
+            blob storage concurrency.
+        job_access_information: Supplies the ``url`` prefix and the
+            ``access_signature`` suffix used to build every blob URL.
+        sim_type: Simulation type; normalised with
+            ``canopy.ensure_sim_type_string``.
+        channel_names: Names of the channels to load.
+        vector_metadata: Vector metadata indexed by channel name. Loaded with
+            ``canopy.load_vector_metadata`` when omitted.
+        semaphore: Bounds the number of concurrent ``.bin`` downloads. Created
+            from ``session.default_blob_storage_concurrency`` when omitted.
+            The parquet reads are not guarded by it.
+
+    Returns:
+        A list aligned with ``channel_names``. An entry is ``None`` when the
+        channel is absent from the metadata or its data could not be loaded.
+        Every entry is ``None`` when no vector metadata is available.
+    """
+    sim_type = canopy.ensure_sim_type_string(sim_type)
+
+    if semaphore is None:
+        semaphore = asyncio.Semaphore(session.default_blob_storage_concurrency)
+
+    if vector_metadata is None:
+        vector_metadata = await canopy.load_vector_metadata(session, job_access_information, sim_type)
+
+    if vector_metadata is None:
+        return [None] * len(channel_names)
+
+    # First attempt to load from parquet if available.
+    # We group channels by their x-domain to load from the correct parquet files.
+    parquet_results = {}
+
+    channels_by_x_domain = {}
+    # Metadata without an 'xDomainName' column would make the `.at` lookup raise
+    # KeyError; in that case skip the parquet attempt and use the .bin files.
+    # NOTE(review): presumably older results lack this column - confirm.
+    if 'xDomainName' in vector_metadata.columns:
+        for name in channel_names:
+            if name not in vector_metadata.index:
+                continue
+
+            x_domain = vector_metadata.at[name, 'xDomainName']
+            if pd.isna(x_domain) or not x_domain:
+                continue
+
+            channels_by_x_domain.setdefault(x_domain, []).append(name)
+
+    # Try loading from parquet for each x-domain
+    for x_domain, domain_channels in channels_by_x_domain.items():
+        loaded_from_parquet = await _try_load_channels_from_parquet(
+            job_access_information,
+            sim_type,
+            x_domain,
+            domain_channels,
+            vector_metadata)
+
+        if loaded_from_parquet:
+            for channel in loaded_from_parquet:
+                if channel is not None:
+                    parquet_results[channel.name] = channel
+
+    async def _load_channel(channel_name: str) -> Optional[canopy.LoadedChannel]:
+        """Return the parquet result for a channel, else load its .bin blob."""
+        if channel_name in parquet_results:
+            return parquet_results[channel_name]
+
+        async with semaphore:
+            if channel_name not in vector_metadata.index:
+                logger.debug('Channel not found: %s', channel_name)
+                return None
+
+            channel_metadata = vector_metadata.xs(channel_name)
+
+            points_count: int = channel_metadata['NPtsInChannel']
+            units: str = channel_metadata['units']
+
+            file_name = ''.join([sim_type, '_', channel_name, '.bin'])
+            channel_url = ''.join([job_access_information.url, file_name, job_access_information.access_signature])
+
+            channel_bytes: Optional[bytes] = await session.try_load_bytes(
+                channel_url,
+                f'"{file_name}" from "{job_access_information.url}"')
+
+            if channel_bytes is None:
+                return None
+
+            # The blob carries no dtype header: 4 bytes per point is read as
+            # float32, any other size as float64.
+            if points_count * 4 == len(channel_bytes):
+                data_type = np.float32
+            else:
+                data_type = np.float64
+            channel_data: np.ndarray = np.frombuffer(channel_bytes, data_type)
+
+            return canopy.LoadedChannel(channel_name, units, channel_data)
+
+    # gather() keeps the results in the same order as channel_names.
+    return await asyncio.gather(*[_load_channel(name) for name in channel_names])
+
+async def _try_load_channels_from_parquet(
+        job_access_information: canopy.openapi.BlobAccessInformation,
+        sim_type: str,
+        x_domain: str,
+        channel_names: List[str],
+        vector_metadata: pd.DataFrame) -> Optional[List[Optional[canopy.LoadedChannel]]]:
+    """Best-effort load of channels from one x-domain parquet file.
+
+    Reads ``{sim_type}_{x_domain}_VectorResults.parquet`` from the job's blob
+    location and selects one column per requested channel.
+
+    Args:
+        job_access_information: Supplies the ``url`` prefix and the
+            ``access_signature`` suffix used to build the parquet URL.
+        sim_type: Simulation type string used in the file name.
+        x_domain: X-domain name used in the file name.
+        channel_names: Channels to read; each is used as a column name.
+        vector_metadata: Metadata indexed by channel name; provides ``units``.
+
+    Returns:
+        A list aligned with ``channel_names`` (``None`` for names missing from
+        the metadata), or ``None`` when nothing was requested or the read
+        failed for any reason. Failures are logged at debug level and never
+        raised, so the caller can fall back to another source.
+    """
+    file_name = f'{sim_type}_{x_domain}_VectorResults.parquet'
+    url = f'{job_access_information.url}{file_name}{job_access_information.access_signature}'
+
+    try:
+        # We'll use a single scan for all requested channels that exist in metadata.
+        # De-duplicate while keeping order: selecting the same column twice is
+        # rejected by polars and would abandon the whole parquet read.
+        valid_channels = list(dict.fromkeys(
+            name for name in channel_names if name in vector_metadata.index))
+        if not valid_channels:
+            return None
+
+        # Set membership keeps the result construction below linear.
+        valid_channel_set = set(valid_channels)
+
+        # Fetch all required columns in one go
+        df: pl.DataFrame = await (pl.scan_parquet(url, parallel="columns", storage_options={ "max_retries": 1, "retry_timeout_ms": 100 })
+                                    .select(valid_channels)
+                                    .collect_async())
+
+        return [
+            canopy.LoadedChannel(name, str(vector_metadata.at[name, "units"]), df.get_column(name).to_numpy())
+            if name in valid_channel_set
+            else None
+            for name in channel_names
+        ]
+    except Exception as e:
+        # Deliberately broad: a missing file, a missing column or a transport
+        # error all mean "parquet not usable".
+        logger.debug("Failed to load channels from parquet %s: %s", file_name, e)
+        return None
diff --git a/canopy/load_study_job.py b/canopy/load_study_job.py
@@ -84,20 +84,15 @@ async def load_study_job(
 
             if vector_metadata is not None and channel_names is not None:
                 channels_semaphore = asyncio.Semaphore(session.default_blob_storage_concurrency)
-                tasks: List[Future[Optional[canopy.LoadedChannel]]] = []
-                for channel_name in channel_names:
-                    loaded_channel_task = asyncio.ensure_future(canopy.load_channel(
-                        session,
-                        job_access_information,
-                        sim_type,
-                        channel_name,
-                        vector_metadata=vector_metadata,
-                        semaphore=channels_semaphore))
-
-                    tasks.append(loaded_channel_task)
-
-                for channel_name, task in zip(channel_names, tasks):
-                    loaded_channel = await task
+                loaded_channels = await canopy.load_channels(
+                    session,
+                    job_access_information,
+                    sim_type,
+                    channel_names,
+                    vector_metadata=vector_metadata,
+                    semaphore=channels_semaphore)
+
+                for channel_name, loaded_channel in zip(channel_names, loaded_channels):
                     if loaded_channel is not None:
                         # Convert to a series, which allows the DataFrame to pad with NaNs if
                         # channels happen to be different lengths.
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,8 @@ dependencies = [
     "urllib3>=1.23",
     "pandas>=0.25.1",
     "aiohttp",
-    "munch"
+    "munch",
+    "polars"
 ]
 
 [tool.setuptools]
diff --git a/requirements.txt b/requirements.txt
@@ -7,6 +7,7 @@ urllib3 >= 1.15.1
 pandas >= 0.25.1
 aiohttp
 munch
+polars
 
 pytest-asyncio
 ipykernel

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,8 @@ dependencies = [`
`23`	`23`	`"urllib3>=1.23",`
`24`	`24`	`"pandas>=0.25.1",`
`25`	`25`	`"aiohttp",`
`26`		`- "munch"`
	`26`	`+ "munch",`
	`27`	`+ "polars"`
`27`	`28`	`]`
`28`	`29`
`29`	`30`	`[tool.setuptools]`