diff --git a/.github/workflows/ci-binary-builder.yml b/.github/workflows/ci-binary-builder.yml index fea334ff..793e0956 100644 --- a/.github/workflows/ci-binary-builder.yml +++ b/.github/workflows/ci-binary-builder.yml @@ -21,8 +21,8 @@ jobs: arch: - x64 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6abb541b..3427e4f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,8 +21,8 @@ jobs: arch: - x64 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} @@ -43,15 +43,16 @@ jobs: - uses: julia-actions/julia-runtest@latest continue-on-error: ${{ matrix.version == 'nightly' }} - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v4 with: file: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} docs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: '1' - run: | diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..402a7a61 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,28 @@ +cff-version: "1.2.0" +authors: +- family-names: Barth + given-names: Alexander + orcid: "https://orcid.org/0000-0003-2952-5997" +doi: 10.5281/zenodo.11067062 +message: If you use this software, please cite our article in the + Journal of Open Source Software. 
+preferred-citation: + authors: + - family-names: Barth + given-names: Alexander + orcid: "https://orcid.org/0000-0003-2952-5997" + date-published: 2024-05-29 + doi: 10.21105/joss.06504 + issn: 2475-9066 + issue: 97 + journal: Journal of Open Source Software + publisher: + name: Open Journals + start: 6504 + title: "NCDatasets.jl: a Julia package for manipulating netCDF data + sets" + type: article + url: "https://joss.theoj.org/papers/10.21105/joss.06504" + volume: 9 +title: "NCDatasets.jl: a Julia package for manipulating netCDF data + sets" diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..b8e52839 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017-2024: Alexander Barth. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/LICENSE.md b/LICENSE.md deleted file mode 100644 index e044df7b..00000000 --- a/LICENSE.md +++ /dev/null @@ -1,22 +0,0 @@ -The NCDatasets.jl package is licensed under the MIT "Expat" License: - -> Copyright (c) 2017-2023: Alexander Barth. -> -> Permission is hereby granted, free of charge, to any person obtaining a copy -> of this software and associated documentation files (the "Software"), to deal -> in the Software without restriction, including without limitation the rights -> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -> copies of the Software, and to permit persons to whom the Software is -> furnished to do so, subject to the following conditions: -> -> The above copyright notice and this permission notice shall be included in all -> copies or substantial portions of the Software. -> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -> SOFTWARE. 
-> diff --git a/Project.toml b/Project.toml index 8a1986f3..745d88e5 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" keywords = ["netcdf", "climate and forecast conventions", "oceanography", "meteorology", "climatology", "opendap"] license = "MIT" desc = "Load and create NetCDF files in Julia" -version = "0.14.3" +version = "0.14.5" [deps] CFTime = "179af706-886a-5703-950a-314cd64e0468" @@ -19,8 +19,8 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" CFTime = "0.1.1" CommonDataModel = "0.3.4" DataStructures = "0.17, 0.18" -DiskArrays = "0.3.22, 0.4" -NetCDF_jll = "=400.701.400, =400.702.400, =400.902.5, =400.902.208, =400.902.209" +DiskArrays = "0.3.22, 0.4.4" +NetCDF_jll = "=400.701.400, =400.702.400, =400.902.5, =400.902.208, =400.902.209, =400.902.211" NetworkOptions = "1.2" julia = "1.6" diff --git a/README.md b/README.md index def13aae..f7c814b6 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![codecov.io](http://codecov.io/github/Alexander-Barth/NCDatasets.jl/coverage.svg?branch=master)](http://app.codecov.io/github/Alexander-Barth/NCDatasets.jl?branch=master) [![documentation stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://alexander-barth.github.io/NCDatasets.jl/stable/) [![documentation dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://alexander-barth.github.io/NCDatasets.jl/dev/) +[![DOI](https://joss.theoj.org/papers/10.21105/joss.06504/status.svg)](https://doi.org/10.21105/joss.06504) `NCDatasets` allows one to read and create netCDF files. @@ -35,11 +36,59 @@ Pkg.add("NCDatasets") This Manual is a quick introduction in using NCDatasets.jl. For more details you can read the [stable](https://alexander-barth.github.io/NCDatasets.jl/stable/) or [latest](https://alexander-barth.github.io/NCDatasets.jl/latest/) documentation. 
+* [Create a netCDF file](#create-a-netcdf-file) * [Explore the content of a netCDF file](#explore-the-content-of-a-netcdf-file) * [Load a netCDF file](#load-a-netcdf-file) -* [Create a netCDF file](#create-a-netcdf-file) * [Edit an existing netCDF file](#edit-an-existing-netcdf-file) +## Create a netCDF file + +The following gives an example of how to create a netCDF file by defining dimensions, variables and attributes. + +```julia +using NCDatasets +using DataStructures: OrderedDict +# This creates a new NetCDF file called file.nc. +# The mode "c" stands for creating a new file (clobber) +ds = NCDataset("file.nc","c") + +# Define the dimensions "lon" and "lat" with the sizes 100 and 110, respectively. +defDim(ds,"lon",100) +defDim(ds,"lat",110) + +# Define a global attribute +ds.attrib["title"] = "this is a test file" + +# Define the variable temperature with the attributes units and scale_factor +v = defVar(ds,"temperature",Float32,("lon","lat"), attrib = OrderedDict( + "units" => "degree Celsius", + "scale_factor" => 10, +)) + +# add additional attributes +v.attrib["comments"] = "this is a string attribute with Unicode Ω ∈ ∑ ∫ f(x) dx" + +# Generate some example data +data = [Float32(i+j) for i = 1:100, j = 1:110]; + +# write a single column +v[:,1] = data[:,1]; + +# write the complete data set +v[:,:] = data; + +close(ds) +``` + +It is also possible to create the dimensions, define the variable and set its value with a single call to `defVar`: + +```julia +using NCDatasets +ds = NCDataset("/tmp/test2.nc","c") +data = [Float32(i+j) for i = 1:100, j = 1:110] +v = defVar(ds,"temperature",data,("lon","lat")) +close(ds) +``` ## Explore the content of a netCDF file Before reading the data from a netCDF file, it is often useful to explore the list of variables and attributes defined in it. 
@@ -63,24 +112,28 @@ while to get the global attributes you can do: ```julia ds.attrib ``` -which produces a listing like: + +`NCDataset("file.nc")` produces a listing like: ``` Dataset: file.nc Group: / Dimensions - time = 115 + lon = 100 + lat = 110 Variables - time (115) - Datatype: Float64 - Dimensions: time + temperature (100 × 110) + Datatype: Float32 (Float32) + Dimensions: lon × lat Attributes: - calendar = gregorian - standard_name = time - units = days since 1950-01-01 00:00:00 -[...] + units = degree Celsius + scale_factor = 10 + comments = this is a string attribute with Unicode Ω ∈ ∑ ∫ f(x) dx + +Global attributes + title = this is a test file ``` ## Load a netCDF file @@ -89,7 +142,7 @@ Loading a variable with known structure can be achieved by accessing the variabl ```julia # The mode "r" stands for read-only. The mode "r" is the default mode and the parameter can be omitted. -ds = NCDataset("/tmp/test.nc","r") +ds = NCDataset("file.nc","r") v = ds["temperature"] # load a subset @@ -99,7 +152,7 @@ subdata = v[10:30,30:5:end] data = v[:,:] # load all data ignoring attributes like scale_factor, add_offset, _FillValue and time units -data2 = v.var[:,:] +data2 = v.var[:,:]; # load an attribute @@ -110,7 +163,7 @@ close(ds) In the example above, the subset can also be loaded with: ```julia -subdata = NCDataset("/tmp/test.nc")["temperature"][10:30,30:5:end] +subdata = NCDataset("file.nc")["temperature"][10:30,30:5:end] ``` This might be useful in an interactive session. However, the file `test.nc` is not directly closed (closing the file will be triggered by Julia's garbage collector), which can be a problem if you open many files. On Linux the number of opened files is often limited to 1024 (soft limit). If you write to a file, you should also always close the file to make sure that the data is properly written to the disk. 
@@ -123,52 +176,7 @@ data = NCDataset(filename,"r") do ds end # ds is closed ``` -## Create a netCDF file - -The following gives an example of how to create a netCDF file by defining dimensions, variables and attributes. -```julia -using NCDatasets -using DataStructures -# This creates a new NetCDF file /tmp/test.nc. -# The mode "c" stands for creating a new file (clobber) -ds = NCDataset("/tmp/test.nc","c") - -# Define the dimension "lon" and "lat" with the size 100 and 110 resp. -defDim(ds,"lon",100) -defDim(ds,"lat",110) - -# Define a global attribute -ds.attrib["title"] = "this is a test file" - -# Define the variables temperature with the attribute units -v = defVar(ds,"temperature",Float32,("lon","lat"), attrib = OrderedDict( - "units" => "degree Celsius")) - -# add additional attributes -v.attrib["comments"] = "this is a string attribute with Unicode Ω ∈ ∑ ∫ f(x) dx" - -# Generate some example data -data = [Float32(i+j) for i = 1:100, j = 1:110] - -# write a single column -v[:,1] = data[:,1] - -# write a the complete data set -v[:,:] = data - -close(ds) -``` - -It is also possible to create the dimensions, the define the variable and set its value with a single call to `defVar`: - -```julia -using NCDatasets -ds = NCDataset("/tmp/test2.nc","c") -data = [Float32(i+j) for i = 1:100, j = 1:110] -v = defVar(ds,"temperature",data,("lon","lat")) -close(ds) -``` ## Edit an existing netCDF file @@ -178,7 +186,7 @@ to open it with the `"a"` option. Here, for example, we add a global attribute * file created in the previous step. 
```julia -ds = NCDataset("/tmp/test.nc","a") +ds = NCDataset("file.nc","a") ds.attrib["creator"] = "your name" close(ds); ``` diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 00000000..69bd5a8f --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,136 @@ +@manual{OGC_netCDF, + organization = {Open Geospatial Consortium}, + title = {{NetCDF Binary Encoding Extension Standard: NetCDF Classic and 64-bit Offset Format}}, + year = 2011, + month = 4, + number = {OGC 10-092r3}, + url = {http://www.opengis.net/doc/IS/netcdf-binary/1.0}, +} + +@inproceedings{Rew2006, + author = {Russell, Rew and Hartnett, Edward and Caron, John}, + year = {2006}, + month = {01}, + organization = {Conference: 22nd International Conference on Interactive Information Processing Systems for Meteorology, Oceanography, and Hydrology}, + title = {{NetCDF-4: Software Implementing an Enhanced Data Model for the Geosciences}} +} + +@article{Rew90, + author={Rew, R. and Davis, G.}, + journal={IEEE Computer Graphics and Applications}, + title={{NetCDF: an interface for scientific data access}}, + year={1990}, + volume={10}, + number={4}, + pages={76-82}, + doi={10.1109/38.56302} +} + +@manual{OGC_Zarr, + organization = {Open Geospatial Consortium}, + title = {{Zarr Storage Specification 2.0 Community Standard}}, + year = 2022, + month = 6, + number = {21-050r1}, + url = {http://www.opengis.net/doc/CS/zarr/2.0} +} + + +@article{Barth2022, + author = {A. Barth and A. Alvera-Azc{\'{a}}rate and C. Troupin and J.-M. Beckers}, + title = {{DINCAE} 2.0: multivariate convolutional neural network with error estimates to reconstruct sea surface temperature satellite and altimetry observations}, + year = 2022, + publisher = {Copernicus {GmbH}}, + journal = {Geoscientific Model Development}, + doi = {10.5194/gmd-2021-353}, +} + +@article{Doglioni2023, + AUTHOR = {Doglioni, F. and Ricker, R. and Rabe, B. and Barth, A. and Troupin, C. 
and Kanzow, T.}, + TITLE = {{Sea surface height anomaly and geostrophic current velocity from altimetry measurements over the Arctic Ocean (2011--2020)}}, + JOURNAL = {Earth System Science Data}, + VOLUME = {15}, + YEAR = {2023}, + NUMBER = {1}, + PAGES = {225--263}, + DOI = {10.5194/essd-15-225-2023} +} + +@article{Belgacem21, + AUTHOR = {Belgacem, M. and Schroeder, K. and Barth, A. and Troupin, C. and Pavoni, B. and Raimbault, P. and Garcia, N. and Borghini, M. and Chiggiato, J.}, + TITLE = {{Climatological distribution of dissolved inorganic nutrients in the western Mediterranean Sea (1981--2017)}}, + JOURNAL = {Earth System Science Data}, + VOLUME = {13}, + YEAR = {2021}, + NUMBER = {12}, + PAGES = {5915--5949}, + URL = {https://essd.copernicus.org/articles/13/5915/2021/}, + DOI = {10.5194/essd-13-5915-2021} +} + +@article{OceananigansJOSS, + author = {Ali Ramadhan and Gregory LeClaire Wagner and Chris Hill and Jean-Michel Campin and Valentin Churavy and Tim Besard and Andre Souza and Alan Edelman and Raffaele Ferrari and John Marshall}, + title = {{Oceananigans.jl: Fast and friendly geophysical fluid dynamics on GPUs}}, + year = {2020}, + publisher = {The Open Journal}, + volume = {5}, + number = {53}, + pages = {2018}, + journal = {Journal of Open Source Software}, + doi = {10.21105/joss.02018}, + url = {https://doi.org/10.21105/joss.02018} +} + +@ARTICLE{Shahzadi21, + AUTHOR={Shahzadi, K. and Pinardi, N. and Barth, A. and Troupin, C. and Lyubartsev, V. 
and Simoncelli, S.}, + TITLE={{A New Global Ocean Climatology}}, + JOURNAL={Frontiers in Environmental Science}, + VOLUME={9}, + YEAR={2021}, + URL={https://www.frontiersin.org/articles/10.3389/fenvs.2021.711363}, + DOI={10.3389/fenvs.2021.711363}, + ISSN={2296-665X} +} + + +@misc{NCDatasets, + author = {Alexander Barth}, + title = {{NCDatasets: A julia package for manipulating netCDF data sets}}, + year = {2023}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/Alexander-Barth/NCDatasets.jl}}, + commit = {90ed5641684604096558a77020038583e1f2459f} +} + + +@unpublished{SpeedyWeather, + author = {Milan Klöwer and Maximilian Gelbrecht and Daisuke Hotta and Justin Willmert and Simone Silvestri and Gregory L Wagner and Alistair White and Sam Hatfield and Tom Kimpson and Navid C Constantinou and Chris Hill}, + title = {{SpeedyWeather.jl: Reinventing atmospheric general circulation models towards interactivity and extensibility}}, + year = {2023}, + publisher = {The Open Journal}, + journal = {Journal of Open Source Software (submitted)}, +} + + +@misc{Eaton2023, + author = {Brian Eaton and Jonathan Gregory and Bob Drach and Karl Taylor and Steve Hankin and Jon Blower and John Caron and Rich Signell and Phil Bentley and Greg Rappa and Heinke Höck and Alison Pamment and Martin Juckes and Martin Raspaud and Randy Horne and Timothy Whiteaker and David Blodgett and Charlie Zender and Daniel Lee and David Hassell and Alan D. 
Snow and Tobias Kölling and Dave Allured and Aleksandar Jelenak and Anders Meier Soerensen and Lucile Gaultier and Sylvain Herlédan and Fernando Manzano and Lars Bärring and Christopher Barker and Sadie Bartholomew}, + title = {{NetCDF Climate and Forecast (CF) Metadata Conventions v1.11}}, + publisher = {CF Conventions Committee}, + year = 2023, + urldate = {2023-12-05}, + url = {http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html}, +} + + +@article{Hassell2017, + AUTHOR = {Hassell, D. and Gregory, J. and Blower, J. and Lawrence, B. N. and Taylor, K. E.}, + TITLE = {{A data model of the Climate and Forecast metadata conventions (CF-1.6) with a software implementation (cf-python v2.1)}}, + JOURNAL = {Geoscientific Model Development}, + VOLUME = {10}, + YEAR = {2017}, + NUMBER = {12}, + PAGES = {4619--4646}, + URL = {https://gmd.copernicus.org/articles/10/4619/2017/}, + DOI = {10.5194/gmd-10-4619-2017} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..ab0e3340 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,79 @@ +--- +title: 'NCDatasets.jl: a Julia package for manipulating netCDF data sets' +tags: + - julia + - netcdf + - oceanography + - meteorology + - earth-observation + - climatology + - opendap + - climate-and-forecast-conventions +authors: + - name: Alexander Barth + orcid: 0000-0003-2952-5997 + affiliation: 1 +affiliations: + - name: GHER, University of Liège, Liège, Belgium + index: 1 +date: 13 January 2024 +bibliography: paper.bib +--- + +# Summary + +NCDatasets is a Julia package that allows users to read, create and modify netCDF files (Network Common Data Form). It is based on the Unidata netCDF library [@Rew90; @Rew2006; @OGC_netCDF] which also supports reading data from remote servers using OPeNDAP (Open-source Project for a Network Data Access Protocol, https://www.opendap.org) and the Zarr file format [@OGC_Zarr]. These additional formats are also accessible to users of NCDatasets. 
+ +The aim of NCDatasets is to expose the data and metadata stored in the NetCDF file as lazy data-structures (in particular arrays and dictionaries) used in Julia. +Lazy in this context means that only the requested subset of data is loaded into RAM or written to the disk. One of the design goals of NCDatasets and the netCDF library in general is being able to work with datasets which are potentially larger than the total amount of RAM in a system and to process that data per subset. + +NetCDF allows users to add metadata to datasets and individual variables in the form of a list of key-value pairs called attributes. The meaning of these attributes is +standardized in the CF conventions [@Eaton2023]. While originally proposed for NetCDF files, the CF conventions are now also applied in the context of other formats like GRIB (e.g. the Julia package [GRIBDatasets](https://github.com/JuliaGeo/GRIBDatasets.jl) or the Python package [cfgrib](https://github.com/ecmwf/cfgrib)). + + +# Statement of need + +NetCDF is a commonly used data format in Earth sciences (in particular oceanography, atmospheric sciences and climatology) to store model data, satellite observations and in situ observations. It is particularly well established as a format for distributing and archiving data. The Julia programming language with its native array types, just-in-time compilation and automatic function specialization based on data types is well suited for processing and analyzing large amounts of data often found in Earth sciences. +Therefore, a convenient API mapping the concepts of the NetCDF format and CF conventions to the corresponding equivalents of the Julia programming language is desirable. +There are currently 64 registered Julia packages (as of 15 January 2024) that have NCDatasets as a direct or indirect dependency (not counting optional dependencies). 
+For example, NCDatasets is used with satellite data [@Barth2022; @Doglioni2023], in situ observations [@Belgacem21; @Shahzadi21] as well as numerical ocean models [@OceananigansJOSS] and atmospheric models [@SpeedyWeather]. + + +# Installation + +NCDatasets supports Julia 1.6 and later and can be installed with the Julia package manager using the following Julia commands: + +```julia +using Pkg +Pkg.add("NCDatasets") +``` + +This will automatically install all dependencies and in particular the Unidata netCDF C library for which compiled binaries are currently available for Linux, FreeBSD, Mac OS and Windows thanks to the efforts of the [Yggdrasil.jl](https://github.com/JuliaPackaging/Yggdrasil/) project. + +# Features + +The main objects in the netCDF data model are the dataset (typically representing a whole file), variables (named n-dimensional arrays with named dimensions), dimensions (mapping the dimension names to the corresponding length), attributes and groups (a dataset contained within a dataset). Groups can be recursively nested. Variable names must be unique within a given group, but in two different groups, variable names can be re-used. Current features of NCDatasets include: + +* Attributes, dimensions and groups are exposed to users as dictionary-like objects. Modifying them will directly modify the underlying NetCDF file as long as the file is open in write mode. +* Variables are exposed as array-like objects. Indexing these arrays with the usual Julia syntax will result in loading the corresponding subset into memory. Likewise, assigning a value to a subset will write the data to the disk. +* The netCDF C API provides several functions to query information about the various objects of the netCDF data model. It is possible to query the data and metadata of a NetCDF file in the same way that one would query an array or dictionary. +* Every time a netCDF variable is loaded the required memory is automatically allocated. 
Once this memory is no longer used it will be deallocated by Julia's garbage collector. For high-performance applications, the repeated allocation and deallocation can cause a significant performance overhead. For this use-case, NCDatasets provides in-place variants for loading data. +* Data stored in a contiguous ragged array representation [@Hassell2017; @Eaton2023] are loaded as a vector of vectors. This representation is typically used to load a list of in situ profiles or time series, each of different length. +* Storage parameters like compression and data chunks can be queried and defined. +* Data transformations defined via the CF conventions are applied by default (including scaling, adding an offset, conversion to the `DateTime` structure). Several calendars are standardized in the CF conventions (standard, Gregorian, proleptic Gregorian, Julian, all leap, no leap, 360 day). Where possible, dates are automatically converted to Julia's native date time type, which uses the proleptic Gregorian calendar conforming to the ISO 8601 standard. Date types are handled using the package [CFTime](https://github.com/JuliaGeo/CFTime.jl) (originally part of NCDatasets). +* Additional functionality includes multi-file support (virtually concatenating NetCDF variables spanning multiple files), views of variables and datasets (virtual subsets without loading the whole data in memory), and subsetting variables and datasets using coordinate values instead of indices using the package [CommonDataModel](https://github.com/JuliaGeo/CommonDataModel.jl) (also originally part of NCDatasets). 
+ + +# Similar software + +The Julia package [NetCDF.jl](https://github.com/JuliaGeo/NetCDF.jl) from Fabian Gans and contributors is an alternative to this package which supports a more Matlab/Octave-like interface for reading and writing netCDF files while this package, NCDatasets, is more influenced by the Python [netCDF4](https://github.com/Unidata/netcdf4-python) package. 
In the R community, the packages [RNetCDF](https://github.com/mjwoods/RNetCDF) and [ncdf4](https://cirrus.ucsd.edu/~pierce/ncdf/) fulfill a similar role. + +# Acknowledgements + +I thank [all contributors](https://github.com/Alexander-Barth/NCDatasets.jl/graphs/contributors) to this package, among others, George Datseris, Tristan Carion, Martijn Visser, Charles Troupin, Rafael Schouten, Argel Ramírez Reyes, Kenechukwu Uba, Philippe Roy, Gregory L. Wagner, Gael Forget and Haakon Ludvig Langeland Ervik as well as Unidata for the [netCDF C library](https://github.com/Unidata/netcdf-c) and their time and efforts responding to my questions and issues. All contributors to the [Yggdrasil.jl](https://github.com/JuliaPackaging/Yggdrasil/) project for their effort in building the netCDF library and the required dependencies are also acknowledged. + +# Funding + +Acknowledgment is given to the F.R.S.-FNRS (Fonds de la Recherche Scientifique de Belgique) for funding the position of Alexander Barth. This work was partly performed with funding from the Blue-Cloud 2026 project under the Horizon Europe programme, Grant Agreement No. 101094227. 
+ +# References diff --git a/paper/paper.pdf b/paper/paper.pdf new file mode 100644 index 00000000..54d6ee58 Binary files /dev/null and b/paper/paper.pdf differ diff --git a/src/cfvariable.jl b/src/cfvariable.jl index a9c339ce..ab768204 100644 --- a/src/cfvariable.jl +++ b/src/cfvariable.jl @@ -114,6 +114,7 @@ function defVar(ds::NCDataset,name::SymbolOrString,vtype::DataType,dimnames; typeid = nc_def_vlen(ds.ncid, typename, ncType[eltype(vtype)]) else # base-type + haskey(ncType, vtype) || error("$vtype not supported") ncType[vtype] end diff --git a/src/dimensions.jl b/src/dimensions.jl index b54244c8..3626946e 100644 --- a/src/dimensions.jl +++ b/src/dimensions.jl @@ -51,7 +51,7 @@ defDim(ds,"time",Inf) defVar(ds,"unlimited_variable",Float64,("lon","lat","time")) @show ds.dim["time"] # returns 0 as no data is added -ds["unlimited_variable"][:,:,:] = randn(10,10,4) +ds["unlimited_variable"][:,:,1:4] = randn(10,10,4) @show ds.dim["time"] # returns now 4 as 4 time slice have been added close(ds) diff --git a/src/netcdf_c.jl b/src/netcdf_c.jl index 39bde4fa..3d27b558 100644 --- a/src/netcdf_c.jl +++ b/src/netcdf_c.jl @@ -593,6 +593,12 @@ function nc_put_att(ncid::Integer,varid::Integer,name::SymbolOrString,data::Vect nc_put_att(ncid,varid,name,ncType[T],data) end +function nc_put_att(ncid::Integer,varid::Integer,name::SymbolOrString,data::Vector{Any}) + T = promote_type(typeof.(data)...) + @debug "promoted type for attribute $T" + nc_put_att(ncid,varid,name,ncType[T],T.(data)) +end + # convert e.g. 
ranges to vectors function nc_put_att(ncid::Integer,varid::Integer,name::SymbolOrString,data::AbstractVector) nc_put_att(ncid,varid,name,Vector(data)) @@ -868,7 +874,7 @@ function nc_put_vara(ncid::Integer,varid::Integer,startp,countp, end function nc_get_vara!(ncid::Integer,varid::Integer,startp,countp,ip) - @debug "nc_get_vara!",startp,indexp + @debug "nc_get_vara!",startp,countp check(ccall((:nc_get_vara,libnetcdf),Cint,(Cint,Cint,Ptr{Csize_t},Ptr{Csize_t},Ptr{Nothing}),ncid,varid,startp,countp,ip)) end @@ -2177,7 +2183,8 @@ end function nc_rc_get(key) p = ccall((:nc_rc_get,libnetcdf),Cstring,(Cstring,),key) - if p !== C_NULL + + if p != C_NULL unsafe_string(p) else error("NetCDF: nc_rc_get: unable to get key $key") diff --git a/src/variable.jl b/src/variable.jl index 171a6c02..370372d3 100644 --- a/src/variable.jl +++ b/src/variable.jl @@ -92,10 +92,9 @@ function checkbuffer(len,data) end end -@inline function unsafe_load!(ncvar::Variable, data, indices::Union{Integer, UnitRange, StepRange, Colon}...) +@inline function unsafe_load!(ncvar::Variable, data, indices::Union{Integer, UnitRange, StepRange, CartesianIndex, CartesianIndices, Colon}...) sizes = size(ncvar) - normalizedindices = normalizeindexes(sizes, indices) - ind = to_indices(ncvar,normalizedindices) + ind = to_indices(ncvar,indices) start,count,stride,jlshape = ncsub(ncvar,ind) @@ -137,7 +136,7 @@ load!(ds["temp"].var,data,:,1) # loads the 1st column array must be `UInt8` and cannot be the julia `Char` type, because the julia `Char` type uses 4 bytes and the NetCDF `NC_CHAR` only 1 byte. """ -@inline function load!(ncvar::Variable{T,N}, data::AbstractArray{T}, indices::Union{Integer, UnitRange, StepRange, Colon}...) where {T,N} +@inline function load!(ncvar::Variable{T,N}, data::AbstractArray{T}, indices::Union{Integer, UnitRange, StepRange, CartesianIndex, CartesianIndices, Colon}...) where {T,N} unsafe_load!(ncvar, data, indices...) 
end @@ -427,14 +426,11 @@ end _write_data_to_nc(v::Variable, data) = _write_data_to_nc(v, data, 1) -function _write_data_to_nc(v::Variable{T, N}, data, indexes::StepRange{<:Integer,<:Integer}...) where {T, N} - start,count,stride,jlshape = ncsub(v,indexes) - nc_put_vars(v.ds.ncid,v.varid,start,count,stride,T.(data)) -end +function _write_data_to_nc(v::Variable{T}, data, indexes::AbstractRange{<:Integer}...) where T + ind = prod(length.(indexes)) == 1 ? first.(indexes) : to_indices(v,indexes) -function _write_data_to_nc(v::Variable, data, indexes::Union{AbstractRange{<:Integer}}...) - ind = prod(length.(indexes)) == 1 ? first.(indexes) : normalizeindexes(size(v),indexes) - return _write_data_to_nc(v, data, ind...) + start,count,stride,jlshape = ncsub(v,indexes) + return nc_put_vars(v.ds.ncid,v.varid,start,count,stride,T.(data)) end function eachchunk(v::Variable) @@ -452,19 +448,6 @@ haschunks(v::Variable) = (_chunking(v)[1] == :contiguous ? DiskArrays.Unchunked( eachchunk(v::CFVariable{T,N,<:Variable}) where {T,N} = eachchunk(v.var) haschunks(v::CFVariable{T,N,<:Variable}) where {T,N} = haschunks(v.var) -_normalizeindex(n,ind::Base.OneTo) = 1:1:ind.stop -_normalizeindex(n,ind::Colon) = 1:1:n -_normalizeindex(n,ind::Integer) = ind:1:ind -_normalizeindex(n,ind::UnitRange) = StepRange(ind) -_normalizeindex(n,ind::StepRange) = ind -_normalizeindex(n,ind) = error("unsupported index") - -# indexes can be longer than sz -function normalizeindexes(sz,indexes) - return ntuple(i -> _normalizeindex(sz[i],indexes[i]), length(sz)) -end - - # computes the size of the array `a` after applying the indexes # size(a[indexes...]) == size_getindex(a,indexes...) diff --git a/test/perf/README.md b/test/perf/README.md index 874149c8..cecfb5b7 100644 --- a/test/perf/README.md +++ b/test/perf/README.md @@ -1,11 +1,5 @@ # Benchmarks -The operating systems typically caches access to the file system. 
-To make these benchmarks more realistic, the file system caches is dropped at every iteration so that the disk IO *is* included in the reported run times. -On Linux, the caches are dropped by writing `3` to the file `/proc/sys/vm/drop_caches` however this requires super user privileges. -These benchmarks require a Linux operating system (as dropping file caches is OS-specific). - - ## Installation ### Julia packages @@ -43,16 +37,33 @@ These are the steps to run the benchmark: julia generate_data.jl ``` -* As a *root user*, run the shell script `benchmark.sh`. It is necessary that the root user has access to the Julia, python and R netCDF packages (NCDatasets, netCDF4 and ncdf4 respectively). +* Run the shell script `benchmark.sh`. ```bash ./benchmark.sh ``` +The script will output a markdown table with the benchmark statistics. + +## Dropping file caches + +The operating system typically caches access to the file system. +To make these benchmarks more realistic, the file system caches can be dropped at every iteration using the benchmark script with the option `--drop-caches` +so that the disk IO *is* included in the reported run times. +On Linux, the caches are dropped by writing `3` to the file `/proc/sys/vm/drop_caches`; however, this requires superuser privileges. +In this case, these benchmarks require a Linux operating system (as dropping file caches is OS-specific). + + +* As a *root user*, run the shell script `benchmark.sh` with the option `--drop-caches`. It is necessary that the root user has access to the Julia, Python and R netCDF packages (NCDatasets, netCDF4 and ncdf4 respectively). + +```bash +./benchmark.sh --drop-caches +``` + If all packages are installed in the home directory of an unpriviledges user e.g. 
`my_user_name`, they can be made available to the root user changing temporarily the `HOME` environement variable to `/home/my_user_name` in the root shell before running `./benchmark.sh`: ```bash -HOME=/home/my_user_name ./benchmark.sh +HOME=/home/my_user_name ./benchmark.sh --drop-caches ``` -The script will output a markdown table with the benchmark statistics. +The benchmark results in the [README file](https://github.com/Alexander-Barth/NCDatasets.jl/blob/master/README.md) were obtained with this option enabled. diff --git a/test/perf/benchmark-R-ncdf4.R b/test/perf/benchmark-R-ncdf4.R index 584cc3cb..b57a2268 100644 --- a/test/perf/benchmark-R-ncdf4.R +++ b/test/perf/benchmark-R-ncdf4.R @@ -6,15 +6,18 @@ library(ncdf4) library(microbenchmark) +print(R.version.string) print(paste("ncdf4 version: ",packageVersion("ncdf4"))) fname = "filename_fv.nc" -process <- function(fname) { - # drop file caches; requires root - fileConn<-file("/proc/sys/vm/drop_caches",open = "wt") - writeLines("3", fileConn) - close(fileConn) +process <- function(fname,drop_caches) { + if (drop_caches) { + # drop file caches; requires root + fileConn<-file("/proc/sys/vm/drop_caches",open = "wt") + writeLines("3", fileConn) + close(fileConn) + } nc = nc_open(fname) @@ -29,14 +32,16 @@ process <- function(fname) { return(tot/nmax) } +drop_caches <- "--drop-caches" %in% commandArgs(trailingOnly=TRUE) +print(paste("drop caches: ",drop_caches)) start_time <- Sys.time() -tot = process(fname) +tot = process(fname,drop_caches) end_time <- Sys.time() print(paste("time ",end_time - start_time)) print(paste("result ",tot)) -mbm <- microbenchmark("ncdf4" = process(fname),times=100) +mbm <- microbenchmark("ncdf4" = process(fname,drop_caches),times=100) fileConn<-file("R-ncdf4.txt",open = "wt") diff --git a/test/perf/benchmark-julia-NCDatasets.jl b/test/perf/benchmark-julia-NCDatasets.jl index 347a0505..a1fe4db6 100644 --- a/test/perf/benchmark-julia-NCDatasets.jl +++ b/test/perf/benchmark-julia-NCDatasets.jl @@ 
-13,9 +13,11 @@ function compute(v) return tot/size(v,3) end -function process(fname) - # drop file caches; requires root - write("/proc/sys/vm/drop_caches","3") +function process(fname,drop_caches) + if drop_caches + # drop file caches; requires root + write("/proc/sys/vm/drop_caches","3") + end ds = NCDataset(fname,"r") do ds v = ds["v1"]; @@ -24,12 +26,15 @@ function process(fname) end end -fname = "filename_fv.nc" -tot = process(fname) +drop_caches = "--drop-caches" in ARGS +println("Julia ",VERSION) +println("drop caches: ",drop_caches) +fname = "filename_fv.nc" +tot = process(fname,drop_caches) println("result ",tot) -bm = run(@benchmarkable process(fname) samples=100 seconds=10000) +bm = run(@benchmarkable process(fname,drop_caches) samples=100 seconds=10000) @show bm diff --git a/test/perf/benchmark-python-netCDF4.py b/test/perf/benchmark-python-netCDF4.py index 661103b3..cd64c8be 100644 --- a/test/perf/benchmark-python-netCDF4.py +++ b/test/perf/benchmark-python-netCDF4.py @@ -5,6 +5,7 @@ import netCDF4 import numpy as np import timeit +import sys def compute(v): tot = 0 @@ -13,9 +14,10 @@ def compute(v): return tot/v.shape[0] -def process(fname): - with open("/proc/sys/vm/drop_caches","w") as f: - f.write("3") +def process(fname,drop_caches): + if drop_caches: + with open("/proc/sys/vm/drop_caches","w") as f: + f.write("3") with netCDF4.Dataset(fname) as ds: v = ds["v1"] @@ -24,15 +26,20 @@ def process(fname): if __name__ == "__main__": + drop_caches = "--drop-caches" in sys.argv + + print("Python ",sys.version) + print("drop caches: ",drop_caches) + fname = "filename_fv.nc"; - tot = process(fname) + tot = process(fname,drop_caches) print("result ",tot) setup = "from __main__ import process" print("python-netCDF4 version ",netCDF4.__version__) - benchtime = timeit.repeat(lambda: process(fname), setup=setup,number = 1, repeat = 100) + benchtime = timeit.repeat(lambda: process(fname,drop_caches), setup=setup,number = 1, repeat = 100) with 
open("python-netCDF4.txt","w") as f: for bt in benchtime: print(bt,file=f) diff --git a/test/perf/benchmark.sh b/test/perf/benchmark.sh index e4a76c4b..e3acc2c3 100755 --- a/test/perf/benchmark.sh +++ b/test/perf/benchmark.sh @@ -1,7 +1,8 @@ #!/bin/bash -julia benchmark-julia-NCDatasets.jl -python3 benchmark-python-netCDF4.py -Rscript benchmark-R-ncdf4.R +args="$@" +julia benchmark-julia-NCDatasets.jl $args +python3 benchmark-python-netCDF4.py $args +Rscript benchmark-R-ncdf4.R $args julia summary.jl diff --git a/test/test_attrib.jl b/test/test_attrib.jl index e5f76970..cdde0fca 100644 --- a/test/test_attrib.jl +++ b/test/test_attrib.jl @@ -126,4 +126,14 @@ end rm(filename) + #filename = "/tmp/mytest.nc" + +# test untyped attributes +filename = tempname() +vector_attrib = Any[Int32(1),Int32(2)] +ds = NCDataset(filename,"c") +# test deletion of attributes +ds.attrib["vector_attrib"] = vector_attrib +@test ds.attrib["vector_attrib"] == vector_attrib +close(ds) diff --git a/test/test_lowlevel.jl b/test/test_lowlevel.jl index c3bc0fcf..9bf26fb2 100644 --- a/test/test_lowlevel.jl +++ b/test/test_lowlevel.jl @@ -48,7 +48,18 @@ for sampledata in samples # reverse order varid = NCDatasets.nc_def_var(ncid, varname, xtype, reverse(dimids)) NCDatasets.nc_put_att(ncid, varid, "attr-string-list",["one","two"]) + + # test nc_put_var1 + # test nc_get_var1 + index = [1 for i in 1:ndims(sampledata)] .- 1 + NCDatasets.nc_put_var1(ncid,varid,index,first(sampledata)) + @test NCDatasets.nc_get_var1(T,ncid,varid,index) == first(sampledata) + + # test nc_put_var NCDatasets.nc_put_var(ncid, varid, sampledata) + + #@test first(sampledata) == var1 + NCDatasets.nc_close(ncid) # load data @@ -57,6 +68,12 @@ for sampledata in samples xtype2 = NCDatasets.nc_inq_vartype(ncid,varid) @test xtype == xtype + name2,jltype2,dimids2,natts2 = NCDatasets.nc_inq_var(ncid,varid) + @test name2 == varname + @test jltype2 == T + @test dimids2 == reverse(dimids) + @test natts2 == 1 + attrval = 
NCDatasets.nc_get_att(ncid, varid, "attr-string-list") @test attrval == ["one","two"] @@ -83,3 +100,24 @@ NCDatasets.nc_close(ncid) ncid = NCDatasets.nc_open(split("$(filename)#foo",'#')[1],NCDatasets.NC_NOWRITE) NCDatasets.nc_close(ncid) + + +# Set/get netcdf rc configuration +# https://github.com/Unidata/netcdf-c/blob/main/docs/auth.md + +if NCDatasets.netcdf_version() > v"4.9.0" + NCDatasets.nc_rc_set("HTTP.SSL.VALIDATE","1") + @test NCDatasets.nc_rc_get("HTTP.SSL.VALIDATE") == "1" +end + +@test_throws ErrorException NCDatasets.nc_rc_get("does_not_exists") + +# test NCDatasets.nc_inq_filter_avail + +filename = tempname() +mode = NCDatasets.NC_CLOBBER +ncid = NCDatasets.nc_create(filename,mode) +id = 32015 # Zstandard +# Zstandard is not available for NetCDF 3 files +@test !NCDatasets.nc_inq_filter_avail(ncid,id) +NCDatasets.nc_close(ncid) diff --git a/test/test_variable.jl b/test/test_variable.jl index 0393444f..4f63d0f5 100644 --- a/test/test_variable.jl +++ b/test/test_variable.jl @@ -288,3 +288,25 @@ data2 = zeros(Int,10) # asking too many elements @test_throws BoundsError NCDatasets.load!(ds["data"].var,data2,1:10) close(ds) + +# issue 250 +fname = tempname() +ds = NCDataset(fname,"c") +defDim(ds,"lon",100) +defDim(ds,"lat",110) +v = defVar(ds,"temperature",Float32,("lon","lat")) +data = [Float32(i+j) for i = 1:100, j = 1:110]; +v[:,:] = data; +close(ds) +ds = NCDataset(fname) +@test ds["temperature"][CartesianIndices((1:10,10:30))] == data[CartesianIndices((1:10,10:30))] +@test ds["temperature"][CartesianIndex(1,1)] == data[CartesianIndex(1,1)] + +# read in-place +v = zeros(Float32, 10, 21); +NCDatasets.load!(variable(ds, "temperature"), v, CartesianIndices((1:10,10:30))) +@test v[:,:] == data[CartesianIndices((1:10,10:30))] +vv = [1.0f0] +NCDatasets.load!(variable(ds, "temperature"), vv, CartesianIndex(5,5)) +@test vv[1] == data[CartesianIndex(5,5)] +close(ds) diff --git a/test/test_writevar.jl b/test/test_writevar.jl index b7cac335..47b7cd50 100644 --- 
a/test/test_writevar.jl +++ b/test/test_writevar.jl @@ -12,6 +12,8 @@ ds = NCDataset(filename,"c") defDim(ds,"lon",sz[1]) defDim(ds,"lat",sz[2]) +# vartype not supported +@test_throws ErrorException defVar(ds,"var-DT",DateTime,("lon","lat")) # variables for T in [UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64,Float32,Float64]