Skip to content

Commit da20913

Browse files
Parquet reading (#20)
* Initial * Next AI commit * Read from domain names * Remove load_channels * Update python test versions * Just 3.14 * Copilot feedback * Incorrect parameter name * Use gather and make more Pythonic
1 parent c4cec08 commit da20913

7 files changed

Lines changed: 144 additions & 94 deletions

File tree

azure-pipelines.yml

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,21 @@ pool:
1010
vmImage: 'ubuntu-latest'
1111

1212
variables:
13-
python.buildVersion: '3.8'
14-
15-
major: 8
13+
python.buildVersion: '3.10'
14+
isPullRequest: ${{ eq(variables['Build.Reason'], 'PullRequest') }}
15+
major: 9
1616
minor: $[counter(variables.major, 0)]
17-
buildVersion: $[format('{0}.{1}', variables.major, variables.minor)]
18-
19-
isPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]
17+
${{ if eq(variables['isPullRequest'], 'True') }}:
18+
buildVersion: $[format('{0}.{1}.dev1', variables.major, variables.minor)]
19+
${{ if eq(variables['isPullRequest'], 'False') }}:
20+
buildVersion: $[format('{0}.{1}', variables.major, variables.minor)]
2021

2122
name: '$(buildVersion)'
2223

2324
jobs:
2425
- job: Test
2526
strategy:
2627
matrix:
27-
Python38:
28-
python.version: '3.8'
29-
Python39:
30-
python.version: '3.9'
3128
Python310:
3229
python.version: '3.10'
3330
Python311:
@@ -36,6 +33,8 @@ jobs:
3633
python.version: '3.12'
3734
Python313:
3835
python.version: '3.13'
36+
Python314:
37+
python.version: '3.14'
3938

4039
steps:
4140
- task: UsePythonVersion@0
@@ -65,7 +64,6 @@ jobs:
6564
6665
- job: Build
6766
dependsOn: Test
68-
condition: and(succeeded(), eq(variables['isPullRequest'], 'False'))
6967

7068
steps:
7169
- task: UsePythonVersion@0

canopy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
from canopy.update_config import update_config
4343
from canopy.load_study import load_study
4444
from canopy.load_study_job import load_study_job
45-
from canopy.load_channel import load_channel
45+
from canopy.load_channels import load_channels
4646
from canopy.load_vector_metadata import load_vector_metadata
4747
from canopy.get_study_document import get_study_document
4848
from canopy.job_count_to_simulation_count import job_count_to_simulation_count

canopy/load_channel.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

canopy/load_channels.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import polars as pl
2+
from typing import List, Optional
3+
import numpy as np
4+
import pandas as pd
5+
import canopy
6+
import logging
7+
import asyncio
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
async def load_channels(
        session: canopy.Session,
        job_access_information: canopy.openapi.BlobAccessInformation,
        sim_type: str,
        channel_names: List[str],
        vector_metadata: Optional[pd.DataFrame] = None,
        semaphore: Optional[asyncio.Semaphore] = None) -> List[Optional[canopy.LoadedChannel]]:
    """Load several channels of one job, preferring parquet files over per-channel blobs.

    Channels are first grouped by the ``xDomainName`` recorded in the vector
    metadata and fetched from the matching ``<sim_type>_<x_domain>_VectorResults.parquet``
    file. Any channel that could not be obtained that way is then loaded from
    its individual ``<sim_type>_<channel_name>.bin`` blob.

    Args:
        session: Session used to download blobs and to supply the default
            concurrency limit.
        job_access_information: Base URL and access signature of the job's blob storage.
        sim_type: Simulation type; normalised with ``canopy.ensure_sim_type_string``.
        channel_names: Names of the channels to load. Duplicates are allowed.
        vector_metadata: Vector metadata indexed by channel name. Loaded with
            ``canopy.load_vector_metadata`` when omitted.
        semaphore: Limits the number of concurrent downloads. A new one sized by
            ``session.default_blob_storage_concurrency`` is created when omitted.

    Returns:
        One entry per element of ``channel_names``, in the same order. An entry
        is ``None`` when the channel is missing from the metadata or its data
        could not be loaded. If no vector metadata is available every entry is ``None``.
    """
    sim_type = canopy.ensure_sim_type_string(sim_type)

    if semaphore is None:
        semaphore = asyncio.Semaphore(session.default_blob_storage_concurrency)

    if vector_metadata is None:
        vector_metadata = await canopy.load_vector_metadata(session, job_access_information, sim_type)

    if vector_metadata is None:
        return [None] * len(channel_names)

    # Group the requested channels by x-domain so each parquet file is read once.
    # Metadata without an 'xDomainName' column simply yields no groups, so every
    # channel falls through to the .bin path instead of raising a KeyError.
    channels_by_x_domain = {}
    if 'xDomainName' in vector_metadata.columns:
        # dict.fromkeys drops duplicate names while keeping the request order;
        # selecting the same parquet column twice would fail the whole read.
        for name in dict.fromkeys(channel_names):
            if name not in vector_metadata.index:
                continue

            x_domain = vector_metadata.at[name, 'xDomainName']
            if pd.isna(x_domain) or not x_domain:
                continue

            channels_by_x_domain.setdefault(x_domain, []).append(name)

    async def _load_domain(x_domain: str, domain_channels: List[str]):
        # Parquet downloads honour the same concurrency limit as .bin downloads.
        async with semaphore:
            return await _try_load_channels_from_parquet(
                job_access_information,
                sim_type,
                x_domain,
                domain_channels,
                vector_metadata)

    # Fetch the parquet files of all x-domains concurrently.
    domain_results = await asyncio.gather(*[
        _load_domain(x_domain, domain_channels)
        for x_domain, domain_channels in channels_by_x_domain.items()])

    parquet_results = {}
    for loaded_from_parquet in domain_results:
        for channel in loaded_from_parquet or ():
            if channel is not None:
                parquet_results[channel.name] = channel

    async def _load_channel(channel_name: str) -> Optional[canopy.LoadedChannel]:
        # Already obtained from parquet: nothing left to download.
        if channel_name in parquet_results:
            return parquet_results[channel_name]

        if channel_name not in vector_metadata.index:
            logger.debug('Channel not found: %s', channel_name)
            return None

        async with semaphore:
            channel_metadata = vector_metadata.xs(channel_name)

            points_count: int = channel_metadata['NPtsInChannel']
            units: str = channel_metadata['units']

            file_name = ''.join([sim_type, '_', channel_name, '.bin'])
            channel_url = ''.join([job_access_information.url, file_name, job_access_information.access_signature])

            channel_bytes: Optional[bytes] = await session.try_load_bytes(
                channel_url,
                f'"{file_name}" from "{job_access_information.url}"')

            if channel_bytes is None:
                return None

            # The element width is inferred from the payload size: four bytes
            # per expected point means float32, anything else is read as float64.
            if points_count * 4 == len(channel_bytes):
                data_type = np.float32
            else:
                data_type = np.float64
            channel_data: np.ndarray = np.frombuffer(channel_bytes, data_type)

            return canopy.LoadedChannel(channel_name, units, channel_data)

    return await asyncio.gather(*[_load_channel(name) for name in channel_names])
93+
94+
async def _try_load_channels_from_parquet(
        job_access_information: canopy.openapi.BlobAccessInformation,
        sim_type: str,
        x_domain: str,
        channel_names: List[str],
        vector_metadata: pd.DataFrame) -> Optional[List[Optional[canopy.LoadedChannel]]]:
    """Best-effort load of channels from one x-domain's parquet file.

    Reads ``<sim_type>_<x_domain>_VectorResults.parquet`` from the job's blob
    storage, selecting only the requested columns in a single scan.

    Args:
        job_access_information: Base URL and access signature of the job's blob storage.
        sim_type: Simulation type string used as the file name prefix.
        x_domain: Name of the x-domain whose parquet file is read.
        channel_names: Channels to read. Duplicates are allowed.
        vector_metadata: Vector metadata indexed by channel name; supplies the units.

    Returns:
        One entry per element of ``channel_names``, in the same order, with
        ``None`` for names absent from ``vector_metadata``. Returns ``None``
        instead of a list when no requested name is in the metadata or when
        the read fails for any reason, so the caller can fall back to another
        source. This function never propagates an ``Exception``.
    """
    file_name = f'{sim_type}_{x_domain}_VectorResults.parquet'
    url = f'{job_access_information.url}{file_name}{job_access_information.access_signature}'

    try:
        # Use a single scan for all requested channels that exist in metadata.
        # dict.fromkeys removes duplicate names (keeping order): selecting the
        # same column twice would make the whole read fail.
        valid_channels = [name for name in dict.fromkeys(channel_names) if name in vector_metadata.index]
        if not valid_channels:
            return None

        # Fetch all required columns in one go.
        df: pl.DataFrame = await (pl.scan_parquet(url, parallel="columns", storage_options={ "max_retries": 1, "retry_timeout_ms": 100 })
                                  .select(valid_channels)
                                  .collect_async())

        loaded = {
            name: canopy.LoadedChannel(name, str(vector_metadata.at[name, "units"]), df.get_column(name).to_numpy())
            for name in valid_channels
        }

        # Map back onto the caller's list so positions line up with channel_names.
        return [loaded.get(name) for name in channel_names]
    except Exception as e:
        # Deliberately broad: a missing file, a missing column or any transport
        # error all mean "parquet is not usable here", which is not fatal.
        logger.debug('Failed to load channels from parquet %s: %s', file_name, e)
        return None

canopy/load_study_job.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -84,20 +84,15 @@ async def load_study_job(
8484

8585
if vector_metadata is not None and channel_names is not None:
8686
channels_semaphore = asyncio.Semaphore(session.default_blob_storage_concurrency)
87-
tasks: List[Future[Optional[canopy.LoadedChannel]]] = []
88-
for channel_name in channel_names:
89-
loaded_channel_task = asyncio.ensure_future(canopy.load_channel(
90-
session,
91-
job_access_information,
92-
sim_type,
93-
channel_name,
94-
vector_metadata=vector_metadata,
95-
semaphore=channels_semaphore))
96-
97-
tasks.append(loaded_channel_task)
98-
99-
for channel_name, task in zip(channel_names, tasks):
100-
loaded_channel = await task
87+
loaded_channels = await canopy.load_channels(
88+
session,
89+
job_access_information,
90+
sim_type,
91+
channel_names,
92+
vector_metadata=vector_metadata,
93+
semaphore=channels_semaphore)
94+
95+
for channel_name, loaded_channel in zip(channel_names, loaded_channels):
10196
if loaded_channel is not None:
10297
# Convert to a series, which allows the DataFrame to pad with NaNs if
10398
# channels happen to be different lengths.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ dependencies = [
2323
"urllib3>=1.23",
2424
"pandas>=0.25.1",
2525
"aiohttp",
26-
"munch"
26+
"munch",
27+
"polars"
2728
]
2829

2930
[tool.setuptools]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ urllib3 >= 1.15.1
77
pandas >= 0.25.1
88
aiohttp
99
munch
10+
polars
1011

1112
pytest-asyncio
1213
ipykernel

0 commit comments

Comments
 (0)