diff --git a/.gitignore b/.gitignore index 8ee614c..903ece6 100644 --- a/.gitignore +++ b/.gitignore @@ -178,3 +178,8 @@ bar test.py cutout.png *.out + +_build/ +? +?.* +~* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3d0cd6..48ed5f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: - --line-length=120 - --fix - --exit-non-zero-on-fix - - --preview + - --preview - --exclude - 'dev/*.py' #- repo: https://github.com/pre-commit/mirrors-mypy @@ -46,3 +46,17 @@ repos: # - id: mypy # verbose: true # entry: bash -c 'mypy "$@" || true' -- +- repo: https://github.com/dzhu/rstfmt + rev: v0.0.14 + hooks: + - id: rstfmt + +# - repo: https://github.com/rstcheck/rstcheck +# rev: v6.2.0 +# hooks: +# - id: rstcheck +# args: +# - '--ignore-roles' +# - 'doc' +# - '--ignore-directives' +# - 'toctree' diff --git a/docs/_static/logo.png b/docs/_static/logo.png new file mode 100644 index 0000000..f78572e Binary files /dev/null and b/docs/_static/logo.png differ diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..9a2b3af --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,48 @@ +.wy-side-nav-search { + background-color: #f7f7f7; +} + +/*There is a clash between xarray notebook styles and readthedoc*/ + +.rst-content dl.xr-attrs dt { + all: revert; + font-size: 95%; + white-space: nowrap; +} + +.rst-content dl.xr-attrs dd { + font-size: 95%; +} + +.xr-wrap { + font-size: 85%; +} + +.wy-table-responsive table td, .wy-table-responsive table th { + white-space: inherit; +} + +/* +.wy-table-responsive table td, +.wy-table-responsive table th { + white-space: normal !important; + vertical-align: top !important; +} + +.wy-table-responsive { + margin-bottom: 24px; + max-width: 100%; + overflow: visible; +} */ + +/* Hide notebooks warnings */ +.nboutput .stderr { + display: none; +} + +/* +Set logo size +*/ +.wy-side-nav-search .wy-dropdown > a img.logo, .wy-side-nav-search > a 
img.logo { + width: 200px; +} diff --git a/docs/_templates/.gitkeep b/docs/_templates/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/apply-fmt.sh b/docs/apply-fmt.sh new file mode 100755 index 0000000..762a8e3 --- /dev/null +++ b/docs/apply-fmt.sh @@ -0,0 +1,5 @@ +: +for n in $(find . -name '*.rst') +do + rstfmt $n +done diff --git a/docs/check-index.sh b/docs/check-index.sh new file mode 100755 index 0000000..5bd2e4f --- /dev/null +++ b/docs/check-index.sh @@ -0,0 +1,7 @@ +: +# See https://github.com/vscode-restructuredtext/vscode-restructuredtext/issues/280 +for n in $(find . -name '*.rst') +do + m=$(echo $n | sed 's/\.rst//' | sed 's,^\./,,') + egrep ":doc:.$m" index.rst > /dev/null || echo $m +done diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..8889135 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,81 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +import datetime + +# top = os.path.realpath(os.path.dirname(os.path.dirname(__file__))) +# sys.path.insert(0, top) + + +source_suffix = ".rst" +master_doc = "index" +pygments_style = "sphinx" +html_theme_options = {"logo_only": True} +html_logo = "_static/logo.png" + + +# -- Project information ----------------------------------------------------- + +project = "Anemoi" + +author = "ECMWF" + +year = datetime.datetime.now().year +if year == 2024: + years = "2024" +else: + years = "2024-%s" % (year,) + +copyright = "%s, ECMWF" % (years,) + + +release = "0.1.0" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx_rtd_theme", + "nbsphinx", +] + +# Add any paths that contain templates here, relative to this directory. +# templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "'**.ipynb_checkpoints'"] + + +# https://www.notion.so/Deepnote-Launch-Buttons-63c642a5e875463495ed2341e83a4b2a + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] +html_css_files = ["style.css"] diff --git a/docs/datasets/about.rst b/docs/datasets/about.rst new file mode 100644 index 0000000..f0b6f57 --- /dev/null +++ b/docs/datasets/about.rst @@ -0,0 +1,27 @@ +################## + Training dataset +################## + +Training datasets are large array-like objects encoded in Zarr_ format. + + +The array has the following dimensions: + +.. figure:: data.png + :alt: Data layout + +The first dimension is the time dimension, the second dimension is the +variables (e.g. temperature, pressure, etc.), the third dimension is the +ensemble, and the fourth dimension is the grid point values. + +This structure provides an efficient way to build the training dataset, +as the input and output of the model are simply consecutive slices of the +array. + +.. code:: python + + x, y = ds[n], ds[n+1] + y_hat = model.predict(x) + loss = model.loss(y, y_hat) + +.. _zarr: https://zarr.readthedocs.io/ diff --git a/docs/datasets/build.png b/docs/datasets/build.png new file mode 100644 index 0000000..1dbe5ff Binary files /dev/null and b/docs/datasets/build.png differ diff --git a/docs/datasets/building.rst b/docs/datasets/building.rst new file mode 100644 index 0000000..1bd8ecc --- /dev/null +++ b/docs/datasets/building.rst @@ -0,0 +1,80 @@ +.. _datasets-building: + +################### + Building datasets +################### + +.. + .. figure:: build.png + +.. + :alt: Building datasets + +.. + :scale: 50% + +********** + Concepts +********** + +date + Throughout this document, the term `date` refers to a date and time, + not just a date. A training dataset covers a continuous range of + dates with a given frequency. Missing dates are still part of the + dataset, but the data are missing and marked as such using NaNs. + Dates are always in UTC, and refer to the date at which the data is + valid. For accumulations and fluxes, that would be the end of the + accumulation period. 
+ +variable + A `variable` is a meteorological parameter, such as temperature, wind, + etc. Multilevel parameters are treated as separate variables, one for + each level. For example, temperature at 850 hPa and temperature at + 500 hPa will be treated as two separate variables (`t_850` and + `t_500`). + +field + A `field` is a variable at a given date. It is represented by an array + of values at each grid point. + +source + The `source` is a software component that, given a list of dates and + variables, will return the corresponding fields. Examples of sources + are ECMWF's MARS archive, a collection of GRIB or NetCDF files, a + database, etc. See :ref:`dataset-sources` for more information. + +filter + A `filter` is a software component that takes as input the output of + a source or the output of another filter, and can modify the fields and/or + their metadata. For example, typical filters are interpolations, + renaming of variables, etc. See :ref:`dataset-filters` for more + information. + +************ + Operations +************ + +In order to build a training dataset, sources and filters are combined +using the following operations: + +join + The join is the process of combining data from several sources. Each + source is expected to provide different variables at the same dates. + +pipe + The pipe is the process of transforming fields using filters. The + first step of a pipe is typically a source, a join or another pipe. + The following steps are filters. + +concat + The concatenation is the process of combining different sets of + operations that handle different dates. This is typically used to + build a dataset that spans several years, when several sources + are involved, each providing a different period. + +***************** + Getting started +***************** + +.. 
literalinclude:: building.yaml + :language: yaml diff --git a/docs/datasets/building.yaml b/docs/datasets/building.yaml new file mode 100644 index 0000000..c991100 --- /dev/null +++ b/docs/datasets/building.yaml @@ -0,0 +1,44 @@ +description: Example dataset + +dates: + start: 2020-01-01 00:00:00 + end: 2023-12-31 18:00:00 + frequency: 6h + +build: + group_by: monthly + +input: + join: + - mars: + class: ea + param: [10u, 10v, 2d, 2t, msl, skt, sp, tcw, lsm, sdor, slor, z] + levtype: sfc + + - mars: + class: ea + param: [r, t, u, v, w, z] + levtype: pl + level: [50, 100, 150, 200, 250, 300, 400, 500, 700, 850, 925, 1000] + + - constants: + template: ${input.join.0.mars} + param: + - cos_latitude + - cos_longitude + - sin_latitude + - sin_longitude + - cos_julian_day + - cos_local_time + - sin_julian_day + - sin_local_time + - insolation + +output: + order_by: + - valid_datetime + - param_level + - number + statistics: param_level + remapping: + param_level: "{param}_{levelist}" diff --git a/docs/datasets/concat.png b/docs/datasets/concat.png new file mode 100644 index 0000000..0832de9 Binary files /dev/null and b/docs/datasets/concat.png differ diff --git a/docs/datasets/data.png b/docs/datasets/data.png new file mode 100644 index 0000000..cea4090 Binary files /dev/null and b/docs/datasets/data.png differ diff --git a/docs/datasets/filters.rst b/docs/datasets/filters.rst new file mode 100644 index 0000000..aefac54 --- /dev/null +++ b/docs/datasets/filters.rst @@ -0,0 +1,5 @@ +.. 
_dataset-filters: + +######### + Filters +######### diff --git a/docs/datasets/images.pptx b/docs/datasets/images.pptx new file mode 100644 index 0000000..b73a331 Binary files /dev/null and b/docs/datasets/images.pptx differ diff --git a/docs/datasets/join.png b/docs/datasets/join.png new file mode 100644 index 0000000..41c2082 Binary files /dev/null and b/docs/datasets/join.png differ diff --git a/docs/datasets/options.rst b/docs/datasets/options.rst new file mode 100644 index 0000000..a12e3c5 --- /dev/null +++ b/docs/datasets/options.rst @@ -0,0 +1,282 @@ +######### + Options +######### + +These are equivalent: + +.. code:: python + + ds = open_dataset(path) + ds = open_dataset(dataset=path) + ds = open_dataset({"dataset": path}) + +The last example is useful when the dataset is defined from a +configuration file: + +.. code:: python + + with open("config.yaml") as file: + config = yaml.safe_load(file) + + ds = open_dataset(config) + +When defining a dataset from another, you can either use a path or a +dataset: + +.. code:: python + + open_dataset(path, statistics=other_path) + open_dataset(path, statistics=other_dataset) + open_dataset(path, statistics={"dataset": other_path, ...}) + +This also applies when combining datasets: + +.. code:: python + + open_dataset(ensembles=[dataset1, dataset2, ...]) + open_dataset(ensembles=[path1, path2, ...]) + open_dataset(ensembles=[dataset1, path2, ...]) + open_dataset(ensembles=[{"dataset": path1, ...}, {"dataset": path2, ...}, ...]) + +********* + Options +********* + +.. code:: python + + open_dataset( + dataset, + start=None, + end=None, + frequency=None, + select=None, + drop=None, + reorder=None, + rename=None, + statistics=None, + thinning=None, + area=None, + ensembles=None, + grids=None, + method=None, + ) + +dataset +======= + +This is a path or URL to a ``zarr`` file that has been created with this +package, as described in :ref:`datasets-building`. + +.. 
code:: python + + from ecml_tools.data import open_dataset + + ds = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2") + ds = open_dataset("/path/to/datasets/aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2.zarr") + ds = open_dataset("https://example.com/aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2.zarr") + ds = open_dataset("s3://bucket/aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2.zarr") + +Alternatively, you can pass an already opened dataset: + +.. code:: python + + from ecml_tools.data import open_dataset + + ds1 = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2") + ds2 = open_dataset(ds1, start=1979, end=2020) + +start +===== + +This option lets you subset the dataset by time. You can pass a date or a string: + +.. code:: python + + open_dataset(dataset, start=1980) + +end +=== + +As for the start option, you can pass a date or a string: + +.. code:: python + + open_dataset(dataset, end="2020-12-31") + +The following are equivalent ways of describing ``start`` or ``end``: + +- ``2020`` and ``"2020"`` +- ``202306``, ``"202306"`` and ``"2023-06"`` +- ``20200301``, ``"20200301"`` and ``"2020-03-01"`` + +frequency +========= + +You can change the frequency of the dataset by passing a string with the desired frequency: + +.. code:: python + + ds = open_dataset("aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", frequency="6h") + +select +====== + +.. code:: python + + # Select '2t' and 'tp' in that order + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + select = ["2t", "tp"], + ) + +.. code:: python + + # Select '2t' and 'tp', but preserve the order in which they are in the dataset + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + select = {"2t", "tp"}, + ) + +drop +==== + +You can also drop some variables: + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + drop = ["10u", "10v"], + ) + +reorder +======= + +and reorder them: + +... using a list + +.. 
code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + reorder = ["2t", "msl", "sp", "10u", "10v"], + ) + +... or using a dictionary + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + reorder = {"2t": 0, "msl": 1, "sp": 2, "10u": 3, "10v": 4}, + ) + +rename +====== + +You can also rename variables: + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + rename = {"2t": "t2m"}, + ) + +This will be useful when you join datasets and do not want variables +from one dataset to override the ones from the other. + +.. _statistics: + +statistics +========== + +.. code:: python + + open_dataset(dataset, statistics=other_dataset) + +thinning +======== + +.. code:: python + + open_dataset(dataset, thinning=..., method="every-nth") + +area +==== + +******************** + Combining datasets +******************** + +When combining datasets, the statistics of the first dataset are used by +default. You can change this by setting the :ref:`statistics` option to +a different dataset, even if it is not part of the combination. + +concat +====== + +You can concatenate two or more datasets along the dates dimension. The +package will check that all datasets are compatible (same resolution, +same variables, etc.). Currently, the datasets must be given in +chronological order with no gaps between them. + +.. code:: python + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1940-1978-1h-v2", + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2" + ) + +.. image:: concat.png + :alt: Concatenation + +Please note that you can pass more than two ``zarr`` files to the +function. + + **NOTE:** When concatenating files, the statistics are not recomputed; + it is the statistics of the first file that are returned to the user. + +join +==== + +You can join two datasets that have the same dates, combining their +variables. + +.. 
code:: python + + from ecml_tools.data import open_dataset + + ds = open_dataset( + "aifs-ea-an-oper-0001-mars-o96-1979-2022-1h-v2", + "some-extra-parameters-from-another-source-o96-1979-2022-1h-v2", + ) + +.. image:: join.png + :alt: Join + +If a variable is present in more than one file, the last occurrence of +that variable will be used, and will be at the position of the first +occurrence of that name. + +.. image:: overlay.png + :alt: Overlay + +Please note that you can join more than two ``zarr`` files. + +ensembles +========= + +.. code:: python + + open_dataset(ensembles=[dataset1, dataset2, ...]) + +grids +===== + +.. code:: python + + open_dataset(grids=[dataset1, dataset2, ...], method=...) diff --git a/docs/datasets/overlay.png b/docs/datasets/overlay.png new file mode 100644 index 0000000..e221539 Binary files /dev/null and b/docs/datasets/overlay.png differ diff --git a/docs/datasets/sources.rst b/docs/datasets/sources.rst new file mode 100644 index 0000000..914d913 --- /dev/null +++ b/docs/datasets/sources.rst @@ -0,0 +1,41 @@ +.. _dataset-sources: + +######### + Sources +######### + +.. + ****** + +.. + mars + +.. + ****** + +.. + ****** + +.. + grib + +.. + ****** + +.. + ******** + +.. + netcdf + +.. + ******** + +.. + ********* + +.. + opendap + +.. + ********* diff --git a/docs/datasets/using.rst b/docs/datasets/using.rst new file mode 100644 index 0000000..e0d9900 --- /dev/null +++ b/docs/datasets/using.rst @@ -0,0 +1,3 @@ +################ + Using datasets +################ diff --git a/docs/examples.rst b/docs/examples.rst new file mode 100644 index 0000000..8f14102 --- /dev/null +++ b/docs/examples.rst @@ -0,0 +1,14 @@ +.. _examples: + +########## + Examples +########## + +Here is a list of example notebooks to illustrate how to access data, +create plots, and do machine learning using Anemoi. + +.. 
toctree:: + :maxdepth: 2 + :glob: + + examples/* diff --git a/docs/examples/00-example1.ipynb b/docs/examples/00-example1.ipynb new file mode 100644 index 0000000..dec02a7 --- /dev/null +++ b/docs/examples/00-example1.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hello" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/firststeps.rst b/docs/firststeps.rst new file mode 100644 index 0000000..efe3dde --- /dev/null +++ b/docs/firststeps.rst @@ -0,0 +1,3 @@ +############# + First steps +############# diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..8f48dbe --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,67 @@ +#################################### + Welcome to Anemoi's documentation! +#################################### + +.. warning:: + + This documentation is a work in progress. It is not yet ready. + Currently, the documentation is based on the one from the ecml-tools_ + project, which will be merged into Anemoi. + +*Anemoi* is a framework for developing machine learning weather +forecasting models. It comprises components or packages for preparing +training datasets, conducting ML model training and a registry for +datasets and trained models. Anemoi provides tools for operational +inference, including interfacing to verification software. As a +framework it seeks to handle many of the complexities that +meteorological organisations will share, allowing them to easily train +models from existing recipes but with their own data. + +- :doc:`overview` +- :doc:`installing` +- :doc:`firststeps` +- :doc:`examples` + +.. 
toctree:: + :maxdepth: 1 + :hidden: + + overview + installing + firststeps + examples + +**Datasets** + +- :doc:`datasets/about` +- :doc:`datasets/building` +- :doc:`datasets/sources` +- :doc:`datasets/filters` +- :doc:`datasets/using` +- :doc:`datasets/options` + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Training datasets + + datasets/about + datasets/building + datasets/sources + datasets/filters + datasets/using + datasets/options + +********* + License +********* + +*Anemoi* is available under the open source `Apache License`__. + +.. __: http://www.apache.org/licenses/LICENSE-2.0.html + +.. _ecml-tools: https://github.com/ecmwf-lab/ecml-tools + +.. _pytorch: https://pytorch.org + +.. _zarr: https://zarr.readthedocs.io/ diff --git a/docs/installing.rst b/docs/installing.rst new file mode 100644 index 0000000..5ae334b --- /dev/null +++ b/docs/installing.rst @@ -0,0 +1,3 @@ +############ + Installing +############ diff --git a/docs/overview.rst b/docs/overview.rst new file mode 100644 index 0000000..5af7ff1 --- /dev/null +++ b/docs/overview.rst @@ -0,0 +1,6 @@ +########## + Overview +########## + +This documentation is currently being written. For now, only the +creation and use of training datasets is documented. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..c85e9f1 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,9 @@ +# These are the requirements for readthedoc +sphinx +sphinx_rtd_theme +nbsphinx + +# Also requires `brew install pandoc` on Mac +pandoc + +rstfmt diff --git a/ecml_tools/data/concat.py b/ecml_tools/data/concat.py index 5830439..12fa790 100644 --- a/ecml_tools/data/concat.py +++ b/ecml_tools/data/concat.py @@ -6,6 +6,7 @@ # nor does it submit to any jurisdiction. 
import logging +from functools import cached_property import numpy as np @@ -23,7 +24,8 @@ LOG = logging.getLogger(__name__) -class Concat(Combined): +class ConcatMixin: + def __len__(self): return sum(len(i) for i in self.datasets) @@ -65,6 +67,18 @@ def _get_slice(self, s): return np.concatenate(result) + @cached_property + def missing(self): + result = set() + offset = 0 + for d in self.datasets: + result = result | set(m + offset for m in d.missing) + offset += len(d) + return result + + +class Concat(ConcatMixin, Combined): + def check_compatibility(self, d1, d2): super().check_compatibility(d1, d2) self.check_same_sub_shapes(d1, d2, drop_axis=0) diff --git a/ecml_tools/data/dataset.py b/ecml_tools/data/dataset.py index f9be152..4706b18 100644 --- a/ecml_tools/data/dataset.py +++ b/ecml_tools/data/dataset.py @@ -83,6 +83,15 @@ def _subset(self, **kwargs): bbox = kwargs.pop("area") return Cropping(self, bbox)._subset(**kwargs) + # Keep last + if "shuffle" in kwargs: + from .subset import Subset + + shuffle = kwargs.pop("shuffle") + + if shuffle: + return Subset(self, self._shuffle_indices())._subset(**kwargs) + raise NotImplementedError("Unsupported arguments: " + ", ".join(kwargs)) def _frequency_to_indices(self, frequency): @@ -96,6 +105,11 @@ def _frequency_to_indices(self, frequency): return range(0, len(self), step) + def _shuffle_indices(self): + import numpy as np + + return np.random.permutation(len(self)) + def _dates_to_indices(self, start, end): from .misc import _as_first_date from .misc import _as_last_date diff --git a/ecml_tools/data/misc.py b/ecml_tools/data/misc.py index 6f4a83a..c522ba4 100644 --- a/ecml_tools/data/misc.py +++ b/ecml_tools/data/misc.py @@ -255,6 +255,16 @@ def _open_dataset(*args, zarr_root, **kwargs): for a in args: sets.append(_open(a, zarr_root)) + if "zip" in kwargs: + from .unchecked import zip_factory + + assert not sets, sets + return zip_factory(args, kwargs, zarr_root) + if "chain" in kwargs: + from .unchecked import 
chain_factory + + assert not sets, sets + return chain_factory(args, kwargs, zarr_root) if "join" in kwargs: from .join import join_factory diff --git a/ecml_tools/data/unchecked.py b/ecml_tools/data/unchecked.py new file mode 100644 index 0000000..0b42367 --- /dev/null +++ b/ecml_tools/data/unchecked.py @@ -0,0 +1,170 @@ +# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import logging +from functools import cached_property +from functools import wraps + +from .concat import ConcatMixin +from .debug import Node +from .forewards import Combined +from .misc import _auto_adjust +from .misc import _open + +LOG = logging.getLogger(__name__) + + +class check: + + def __init__(self, check): + self.check = check + + def __call__(self, method): + name = method.__name__ + check = self.check + + @wraps(method) + def wrapper(obj): + """ + This is a decorator that checks the compatibility of the datasets + before calling the method. If the datasets are compatible, it + will return the result of the method, otherwise it will raise an + exception. 
+ """ + + for d in obj.datasets[1:]: + getattr(obj, check)(obj.datasets[0], d) + + return getattr(Combined, name).__get__(obj) + + return wrapper + + +class Unchecked(Combined): + + def tree(self): + return Node(self, [d.tree() for d in self.datasets]) + + def _subset(self, **kwargs): + assert not kwargs + return self + + def check_compatibility(self, d1, d2): + pass + + ########################################### + @property + @check("check_same_dates") + def dates(self): + pass + + @property + @check("check_same_resolution") + def resolution(self): + pass + + @property + def field_shape(self): + raise NotImplementedError() + + @property + @check("check_same_frequency") + def frequency(self): + raise NotImplementedError() + + @property + @check("check_same_grid") + def latitudes(self): + raise NotImplementedError() + + @property + @check("check_same_grid") + def longitudes(self): + raise NotImplementedError() + + @property + @check("check_same_variables") + def name_to_index(self): + raise NotImplementedError() + + @property + @check("check_same_variables") + def variables(self): + raise NotImplementedError() + + @property + @check("check_same_variables") + def statistics(self): + raise NotImplementedError() + + @property + def shape(self): + raise NotImplementedError() + + @property + def dtype(self): + raise NotImplementedError() + + @property + def grids(self): + raise NotImplementedError() + + +class Zip(Unchecked): + + def __len__(self): + return min(len(d) for d in self.datasets) + + def __getitem__(self, n): + return tuple(d[n] for d in self.datasets) + + @cached_property + def missing(self): + result = set() + for d in self.datasets: + result = result | d.missing + return result + + +class Chain(ConcatMixin, Unchecked): + """ + Same as Concat, but with no checks + """ + + def __len__(self): + return sum(len(d) for d in self.datasets) + + def __getitem__(self, n): + return tuple(d[n] for d in self.datasets) + + @property + def dates(self): + raise 
NotImplementedError() + + +def zip_factory(args, kwargs, zarr_root): + + zip = kwargs.pop("zip") + assert len(args) == 0 + assert isinstance(zip, (list, tuple)) + + datasets = [_open(e, zarr_root) for e in zip] + datasets, kwargs = _auto_adjust(datasets, kwargs) + + return Zip(datasets)._subset(**kwargs) + + +def chain_factory(args, kwargs, zarr_root): + + chain = kwargs.pop("chain") + assert len(args) == 0 + assert isinstance(chain, (list, tuple)) + + datasets = [_open(e, zarr_root) for e in chain] + datasets, kwargs = _auto_adjust(datasets, kwargs) + + return Chain(datasets)._subset(**kwargs)