Source code for IS2view.io

"""
io.py
Written by Tyler Sutterley (06/2024)
Utilities for reading gridded ICESat-2 files using rasterio and xarray

PYTHON DEPENDENCIES:
    h5netcdf: Pythonic interface to netCDF4 via h5py
        https://h5netcdf.org/
    numpy: Scientific Computing Tools For Python
        https://numpy.org
        https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
    rasterio: Access to geospatial raster data
        https://github.com/rasterio/rasterio
        https://rasterio.readthedocs.io
    rioxarray: geospatial xarray extension powered by rasterio
        https://github.com/corteva/rioxarray
    xarray: N-D labeled arrays and datasets in Python
        https://docs.xarray.dev/en/stable/

UPDATE HISTORY:
    Updated 06/2024: use wrapper to importlib for optional dependencies
    Updated 10/2023: use dask.delayed to read multiple files in parallel
    Updated 08/2023: use xarray h5netcdf to read files streaming from s3
        add open_dataset function for opening multiple granules
        add merging of datasets in preparation for Release-3 data
    Updated 07/2023: use logging instead of warnings for import attempts
    Written 11/2022
"""
from __future__ import annotations
import os
from IS2view.utilities import import_dependency

# attempt imports
rioxarray = import_dependency('rioxarray')
rioxarray.merge = import_dependency('rioxarray.merge')
dask = import_dependency('dask')
xr = import_dependency('xarray')

# allow anonymous (unsigned) requests when accessing s3
os.environ['AWS_NO_SIGN_REQUEST'] = 'YES'

# xarray engine used by default for each supported data format
_default_engine = {'nc': 'h5netcdf', 'zarr': 'zarr'}


[docs]
def open_dataset(granule,
        group: str | None = None,
        format: str = 'nc',
        parallel: bool = True,
        **kwargs
    ):
    """
    Reads and optionally merges gridded ICESat-2 files

    Parameters
    ----------
    granule: str, list or tuple
        presigned url or path for granule(s) as a s3fs object

        A list (or tuple) of granules is read one-by-one and merged
    group: str or NoneType, default None
        Data group to read
    format: str, default 'nc'
        Data format to read
    parallel: bool, default True
        Open files in parallel using ``dask.delayed``

        Only used when reading multiple granules
    kwargs: dict
        Keyword arguments to pass to ``xarray`` reader

    Returns
    -------
    ds: object
        ``xarray`` dataset

        For multiple granules, closing the merged dataset also
        calls the close function of each individual granule
    """
    # read a single granule
    if not isinstance(granule, (list, tuple)):
        return from_file(granule,
            group=group,
            format=format,
            **kwargs
        )
    # merging multiple granules
    # wrap the reader with dask.delayed to defer opening the files
    if parallel:
        opener = dask.delayed(from_file)
        getattrs = dask.delayed(getattr)
    else:
        opener = from_file
        getattrs = getattr
    # read each granule and keep track of its close function
    datasets = [
        opener(g, group=group, format=format, **kwargs)
        for g in granule
    ]
    closers = [getattrs(ds, "_close") for ds in datasets]
    # evaluate the delayed objects to open the datasets
    if parallel:
        datasets, closers = dask.compute(datasets, closers)
    # merge datasets
    ds = rioxarray.merge.merge_datasets(datasets)

    # attach the close functions of the individual granules to the
    # merged dataset so that their file handles can be released
    def _close_granules():
        for closer in closers:
            # datasets without a backing file have no close function
            if closer is not None:
                closer()
    ds.set_close(_close_granules)
    # return the merged dataset
    return ds



[docs]
def from_file(granule,
        group: str | None = None,
        format: str = 'nc',
        **kwargs
    ):
    """
    Reads a gridded ICESat-2 file, dispatching to the
    ``rioxarray`` or ``xarray`` reader

    String granules in ``'nc'`` format are read with
    :func:`from_rasterio`, everything else with :func:`from_xarray`

    Parameters
    ----------
    granule: str
        presigned url or path for granule
    group: str or NoneType, default None
        Data group to read
    format: str, default 'nc'
        Data format to read

        Must be a key of the default engine mapping
    kwargs: dict
        Keyword arguments to pass to ``xarray`` reader

    Returns
    -------
    ds: object
        ``xarray`` dataset
    """
    # use the default engine for the format if not provided
    kwargs.setdefault('engine', _default_engine[format])
    # choose reader based on the granule type and data format
    use_rasterio = isinstance(granule, str) and (format == 'nc')
    reader = from_rasterio if use_rasterio else from_xarray
    # read and return the dataset
    return reader(granule, group=group, **kwargs)



[docs]
def from_rasterio(granule,
        group: str | None = None,
        **kwargs
    ):
    """
    Opens a gridded ICESat-2 file with ``rioxarray.open_rasterio``

    Parameters
    ----------
    granule: str
        presigned url or path for granule
    group: str or NoneType, default None
        Data group to read
    kwargs: dict
        Keyword arguments to pass to ``rioxarray``

    Returns
    -------
    ds: object
        ``xarray`` dataset
    """
    # open the granule with the masked option enabled
    return rioxarray.open_rasterio(granule, group=group, masked=True, **kwargs)



[docs]
def from_xarray(granule,
        group: str | None = None,
        engine: str = 'h5netcdf',
        **kwargs
    ):
    """
    Reads a gridded ICESat-2 file using ``xarray``

    Parameters
    ----------
    granule: str
        presigned url or path for granule
    group: str or NoneType, default None
        Data group to read
    engine: str, default 'h5netcdf'
        Engine to use when reading files
    variable: str, list or NoneType, default None
        Variable(s) to keep from the dataset

        All variables are kept if empty or ``None``
    kwargs: dict
        Keyword arguments to pass to ``xarray``

        These take precedence over the default reader options

    Returns
    -------
    ds: object
        ``xarray`` dataset
    """
    # variable selection is not an option of the xarray reader
    # (None is accepted to match the rioxarray reader default)
    variable = kwargs.pop('variable', None)
    # default reader options
    options = dict(
        chunks='auto',
        decode_cf=True,
        mask_and_scale=True,
        decode_times=False,
        concat_characters=True,
        decode_coords=True,
        overwrite_encoded_chunks=False,
    )
    # allow the defaults to be overridden instead of raising
    # an error for duplicate keyword arguments
    options.update(kwargs)
    # read xarray dataset
    ds = xr.open_dataset(granule,
        group=group,
        engine=engine,
        **options
    )
    # set the coordinate reference system using the well-known text
    # stored as an attribute of the Polar_Stereographic variable
    ds.rio.write_crs(ds.Polar_Stereographic.attrs['crs_wkt'], inplace=True)
    # reduce xarray dataset to specific variables
    if (variable is not None) and any(variable):
        ds = ds[variable]
    # flip orientation of y dimension
    ds = ds.isel(y=slice(None, None, -1))
    return ds