Source code for IS2view.io

"""
io.py
Written by Tyler Sutterley (06/2024)
Utilities for reading gridded ICESat-2 files using rasterio and xarray

PYTHON DEPENDENCIES:
    h5netcdf: Pythonic interface to netCDF4 via h5py
        https://h5netcdf.org/
    numpy: Scientific Computing Tools For Python
        https://numpy.org
        https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
    rasterio: Access to geospatial raster data
        https://github.com/rasterio/rasterio
        https://rasterio.readthedocs.io
    rioxarray: geospatial xarray extension powered by rasterio
        https://github.com/corteva/rioxarray
    xarray: N-D labeled arrays and datasets in Python
        https://docs.xarray.dev/en/stable/

UPDATE HISTORY:
    Updated 06/2024: use wrapper to importlib for optional dependencies
    Updated 10/2023: use dask.delayed to read multiple files in parallel
    Updated 08/2023: use xarray h5netcdf to read files streaming from s3
        add open_dataset function for opening multiple granules
        add merging of datasets in preparation for Release-3 data
    Updated 07/2023: use logging instead of warnings for import attempts
    Written 11/2022
"""

from __future__ import annotations
import os
from IS2view.utilities import import_dependency

# attempt imports
rioxarray = import_dependency("rioxarray")
rioxarray.merge = import_dependency("rioxarray.merge")
dask = import_dependency("dask")
xr = import_dependency("xarray")

# set environmental variable for anonymous s3 access
os.environ["AWS_NO_SIGN_REQUEST"] = "YES"

# default engine for xarray
_default_engine = dict(nc="h5netcdf", zarr="zarr")


[docs] def open_dataset( granule, group: str | None = None, format: str = "nc", parallel: bool = True, **kwargs, ): """ Reads and optionally merges gridded ICESat-2 files Parameters ---------- granule: str or list presigned url or path for granule(s) as a s3fs object group: str or NoneType, default None Data group to read format: str, default 'nc' Data format to read parallel: bool, default True Open files in parallel using ``dask.delayed`` kwargs: dict Keyword arguments to pass to ``xarray`` reader Returns ------- ds: object ``xarray`` dataset """ # check if merging multiple granules if isinstance(granule, list): # merge multiple granules datasets = [] closers = [] if parallel: opener = dask.delayed(from_file) getattrs = dask.delayed(getattr) else: opener = from_file getattrs = getattr # read each granule and append to list for g in granule: datasets.append(opener(g, group=group, format=format, **kwargs)) closers = [getattrs(ds, "_close") for ds in datasets] # read datasets as dask arrays if parallel: datasets, closers = dask.compute(datasets, closers) # merge datasets ds = rioxarray.merge.merge_datasets(datasets) else: # read a single granule ds = from_file(granule, group=group, format=format, **kwargs) # return the dataset return ds
[docs] def from_file(granule, group: str | None = None, format: str = "nc", **kwargs): """ Reads a gridded ICESat-2 file using ``rioxarray`` or ``xarray`` Parameters ---------- granule: str presigned url or path for granule group: str or NoneType, default None Data group to read format: str, default 'nc' Data format to read kwargs: dict Keyword arguments to pass to ``xarray`` reader Returns ------- ds: object ``xarray`` dataset """ # set default engine kwargs.setdefault("engine", _default_engine[format]) if isinstance(granule, str) and format in ("nc",): ds = from_rasterio(granule, group=group, **kwargs) else: # read a single granule ds = from_xarray(granule, group=group, **kwargs) # return the dataset return ds
[docs] def from_rasterio(granule, group: str | None = None, **kwargs): """ Reads a gridded ICESat-2 file using ``rioxarray`` Parameters ---------- granule: str presigned url or path for granule group: str or NoneType, default None Data group to read kwargs: dict Keyword arguments to pass to ``rioxarray`` Returns ------- ds: object ``xarray`` dataset """ ds = rioxarray.open_rasterio(granule, group=group, masked=True, **kwargs) return ds
[docs] def from_xarray( granule, group: str | None = None, engine: str = "h5netcdf", **kwargs ): """ Reads a gridded ICESat-2 file using ``xarray`` Parameters ---------- granule: str presigned url or path for granule group: str or NoneType, default None Data group to read engine: str, default 'h5netcdf' Engine to use when reading files kwargs: dict Keyword arguments to pass to ``xarray`` Returns ------- ds: object ``xarray`` dataset """ kwargs.setdefault("variable", []) variable = kwargs.pop("variable") # read xarray dataset ds = xr.open_dataset( granule, group=group, engine=engine, chunks="auto", decode_cf=True, mask_and_scale=True, decode_times=False, concat_characters=True, decode_coords=True, overwrite_encoded_chunks=False, **kwargs, ) # set the coordinate reference system ds.rio.write_crs(ds.Polar_Stereographic.attrs["crs_wkt"], inplace=True) # reduce xarray dataset to specific variables if any(variable): ds = ds[variable] # flip orientation of y dimension ds = ds.isel(y=slice(None, None, -1)) return ds