-
-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
chore: Add test for checking physical limits and zeroes in NWP data #… #340
base: main
Are you sure you want to change the base?
Changes from 6 commits
3ee287c
1e2df80
8105b91
1eafe49
d5bc6cf
d8cfa9d
5e68173
466b710
692500c
0667bab
246d898
55627eb
7ba254d
c6ee33d
d0c4f6f
19050c7
3fe89fc
ace0259
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,69 +41,41 @@ def __init__( | |
self.zarr_path = zarr_path | ||
self.check_for_zeros = check_for_zeros | ||
self.check_physical_limits = check_physical_limits | ||
|
||
#limits for NWP data in accordance with https://huggingface.co/openclimatefix/pvnet_uk_region/blob/main/data_config.yaml | ||
self.limits = { | ||
"temperature": (-100, 60), # Celsius | ||
"specific_humidity": (0, 0.03), # kg/kg | ||
"relative_humidity": (0, 100), # Percentage | ||
"pressure": (0, 1100), # hPa (sea level pressure) | ||
"u_wind": (-200, 200), # m/s | ||
"v_wind": (-200, 200), # m/s | ||
"geopotential": (0, 100000), # m^2/s^2 | ||
"total_precipitation": (0, 2000), # mm/day | ||
"convective_precipitation": (0, 1000), # mm/day | ||
"snowfall": (0, 1000), # mm water equivalent/day | ||
"graupel": (0, 500), # mm water equivalent/day | ||
"cloud_cover": (0, 100), # Percentage | ||
"surface_temperature": (-90, 60), # Celsius | ||
"sea_surface_temperature": (-2, 35), # Celsius | ||
"soil_temperature": (-50, 60), # Celsius | ||
"soil_moisture": (0, 1), # m^3/m^3 | ||
"visibility": (0, 100000), # meters | ||
"wind_gust": (0, 250), # m/s | ||
"solar_radiation": (0, 1500), # W/m^2 | ||
"longwave_radiation": (0, 750), # W/m^2 | ||
"evaporation": (0, 50), # mm/day | ||
"potential_evaporation": (0, 100), # mm/day | ||
"boundary_layer_height": (0, 5000), # meters | ||
"cape": (0, 10000), # J/kg | ||
"cin": (0, 1000), # J/kg | ||
"lifted_index": (-15, 15), # Kelvin | ||
"total_column_water": (0, 100), # kg/m^2 | ||
"ozone_concentration": (0, 1000), # Dobson units | ||
"dew_point_temperature": (-100, 35), # Celsius | ||
"wet_bulb_temperature": (-100, 35), # Celsius | ||
"potential_temperature": (0, 1000), # Kelvin | ||
"equivalent_potential_temperature": (0, 1000), # Kelvin | ||
"vorticity": (-1e-3, 1e-3), # 1/s | ||
"divergence": (-1e-3, 1e-3), # 1/s | ||
"vertical_velocity": (-50, 50), # m/s | ||
"cloud_base_height": (0, 20000), # meters | ||
"cloud_top_height": (0, 20000), # meters | ||
"cloud_water_content": (0, 5), # g/kg | ||
"ice_water_content": (0, 5), # g/kg | ||
"surface_roughness": (0, 10), # meters | ||
"albedo": (0, 1), # dimensionless | ||
"friction_velocity": (0, 5), # m/s | ||
"sensible_heat_flux": (-500, 500), # W/m^2 | ||
"latent_heat_flux": (-500, 500), # W/m^2 | ||
"momentum_flux": (-10, 10), # N/m^2 | ||
"surface_pressure": (300, 1100), # hPa | ||
"mean_sea_level_pressure": (870, 1090), # hPa | ||
"tropopause_pressure": (50, 500), # hPa | ||
"tropopause_temperature": (-100, 0), # Celsius | ||
"precipitable_water": (0, 100), # mm | ||
"total_cloud_cover": (0, 100), # Percentage | ||
"low_cloud_cover": (0, 100), # Percentage | ||
"medium_cloud_cover": (0, 100), # Percentage | ||
"high_cloud_cover": (0, 100), # Percentage | ||
"convective_available_potential_energy": (0, 10000), # J/kg | ||
"convective_inhibition": (0, 1000), # J/kg | ||
"storm_relative_helicity": (-1000, 1000), # m^2/s^2 | ||
"bulk_richardson_number": (-10, 10), # dimensionless | ||
"lifted_condensation_level": (0, 5000), # meters | ||
"level_of_free_convection": (0, 20000), # meters | ||
"equilibrium_level": (0, 20000), # meters | ||
"UKV": (250, 330), # UKV specific | ||
"t2m": (173.15, 333.15), # Temperature in Kelvin (-100°C to 60°C) | ||
"dswrf": (0, 1500), # Downward short-wave radiation flux, W/m^2 | ||
"dlwrf": (0, 750), # Downward long-wave radiation flux, W/m^2 | ||
"hcc": (0, 100), # High cloud cover, % | ||
"mcc": (0, 100), # Medium cloud cover, % | ||
"lcc": (0, 100), # Low cloud cover, % | ||
"tcc": (0, 100), # Total cloud cover, % | ||
"sde": (0, 1000), # Snowfall depth, meters | ||
"sr": (0, 10), # Surface roughness, meters | ||
"duvrs": (0, 500), # Direct UV radiation at surface, W/m^2 (positive values only) | ||
"u10": (-200, 200), # U component of 10m wind, m/s | ||
"v10": (-200, 200), # V component of 10m wind, m/s | ||
|
||
# UKV NWP channels (additional to ECMWF) | ||
"prate": (0, 2000), # Precipitation rate, , kg/m^2/s (equivalent to 0-2000 mm/day) | ||
"r": (0, 100), # Relative humidity, % | ||
"si10": (0, 250), # Wind speed at 10m, m/s | ||
"t": (173.15, 333.15), # Temperature in Kelvin (-100°C to 60°C) | ||
glitch401 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"vis": (0, 100000), # Visibility, meters | ||
|
||
# Satellite channels (no direct mapping to physical limits, using placeholder values) | ||
"IR_016": (0, 1), # Infrared channel | ||
"IR_039": (0, 1), # Infrared channel | ||
"IR_087": (0, 1), # Infrared channel | ||
"IR_097": (0, 1), # Infrared channel | ||
"IR_108": (0, 1), # Infrared channel | ||
"IR_120": (0, 1), # Infrared channel | ||
"IR_134": (0, 1), # Infrared channel | ||
"VIS006": (0, 1), # Visible channel | ||
"VIS008": (0, 1), # Visible channel | ||
"WV_062": (0, 1), # Water vapor channel | ||
"WV_073": (0, 1), # Water vapor channel | ||
glitch401 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
logger.info(f"Using {provider.lower()}") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. very much just a suggestion, but it would be nice to have some control over which variables receive the checks. Intuitively, that should probably be possible by just passing a list of keys to be checked instead of |
||
if provider.lower() == "ukv": | ||
|
@@ -138,22 +110,31 @@ def check_if_zeros(self, nwp: Union[xr.DataArray, xr.Dataset]): | |
"""Checks if the NWP data contains zeros""" | ||
if isinstance(nwp, xr.DataArray): | ||
if (nwp.values == 0).any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like it will not be performed lazily (as in, this will load the whole dataArray into memory to check the values), which we really want to avoid in this place because at this point the arrays we operate on are often massive There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've made some changes to accommodate lazy loading. Have leveraged Dask arrays, as its often used with xarray for the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good way to go about it! There is a danger that some of our data might not fit anyway, but since it can be turned on and off that's fine. I was wondering if it's worth exploring implementing this check downstream, somewhere after spacial and temporal crop and before normalisation, so that it operates on samples instead? And then maybe skip ones with too many zeroes/nans/out of physical bounds values and give a userWarning/log info of how many were skipped as a proxy for understanding how much of the data is corrupted. Thoughts @Sukh-P @peterdudfield? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like that idea, less chance of a chance of running into memory issues by loading a chunk, I guess the only draw back would be doing processing on some data you are going to chuck anyway but in this case that processing gets it down to a more manageable size There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @AUdaltsova I'm trying to understand if this is acceptable w.r.t the scope of this PR? 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @glitch401 might be! @peterdudfield happy to merge this then? :) |
||
raise ValueError("NWP DataArray contains zeros") | ||
raise ValueError( | ||
f"NWP DataArray contains{(nwp.values == 0).sum()*100/nwp.values.size}% zeros" | ||
) | ||
if isinstance(nwp, xr.Dataset): | ||
for var in nwp: | ||
if (nwp[var].values == 0).any(): | ||
raise ValueError(f"NWP Dataset variable{var} contains zeros") | ||
raise ValueError( | ||
f"NWP Dataset variable{var} " | ||
f"contains {(nwp[var].values == 0).sum()*100/nwp[var].values.size}% zeros" | ||
) | ||
|
||
def check_if_physical_limits(self, nwp: Union[xr.DataArray, xr.Dataset]): | ||
"""Checks if the NWP data is within physical limits""" | ||
if isinstance(nwp, xr.DataArray): | ||
var_name = nwp.name | ||
var_name = nwp.channel.values[0] | ||
peterdudfield marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if var_name in self.limits: | ||
lower, upper = self.limits[var_name] | ||
if (nwp < lower).any() or (nwp > upper).any(): | ||
raise ValueError(f"NWP data {var_name} is outside physical limits") | ||
raise ValueError( | ||
f"NWP data {var_name} is outside physical limits: ({lower},{upper})" | ||
) | ||
elif isinstance(nwp, xr.Dataset): | ||
for var_name, (lower, upper) in self.limits.items(): | ||
if var_name in nwp.variables: | ||
if var_name in nwp.channel: | ||
peterdudfield marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not ((nwp[var_name] >= lower).all() and (nwp[var_name] <= upper).all()): | ||
raise ValueError(f"NWP data {var_name} is outside physical limits") | ||
raise ValueError( | ||
f"NWP data {var_name} is outside physical limits: ({lower},{upper})" | ||
) |
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this is getting right into OCFs data model here but @devsjc has helped me understand that we have an internal naming convention that deviates slightly from some NWP providers e.g.
sr
actually maps todsrp
for us, not surface roughness. Obviously there would not be a way to know that as a contributor, so apologies for that. @peterdudfield FYIThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point! Can supply keys and then pull corresponding ranges out of consts maybe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you are right @Sukh-P , @peterdudfield did help me understand the conventions