From 72c5e7e4a50be10d814330699e79692ab3ac1aef Mon Sep 17 00:00:00 2001 From: Maarten Pronk <8655030+evetion@users.noreply.github.com> Date: Fri, 21 Jun 2024 21:17:47 +0200 Subject: [PATCH] Add transaction support to `write`. (#72) * Add transactions to writing. * Use Tables.getcolumn instead of getproperty. * Roll back transaction on error. --------- Co-authored-by: Max Freudenberg --- .JuliaFormatter.toml | 10 +++ src/io.jl | 149 +++++++++++++++++++++++++++++++---------- src/utils.jl | 5 +- test/runtests.jl | 156 ++++++++++++++++++++++++++++++------------- 4 files changed, 235 insertions(+), 85 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..d4f2f07 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,10 @@ +# Options for the JuliaFormatter auto syntax formatting tool. +# https://domluna.github.io/JuliaFormatter.jl/stable/ + +whitespace_ops_in_indices = true +remove_extra_newlines = true +always_for_in = true +whitespace_typedefs = true + +# And add other options we like: +separate_kwargs_with_semicolon = true diff --git a/src/io.jl b/src/io.jl index acbafed..0da6796 100644 --- a/src/io.jl +++ b/src/io.jl @@ -21,7 +21,7 @@ function find_driver(fn::AbstractString) AG.extensiondriver(fn) end -const lookup_type = Dict{Tuple{DataType,Int},AG.OGRwkbGeometryType}( +const lookup_type = Dict{Tuple{DataType, Int}, AG.OGRwkbGeometryType}( (AG.GeoInterface.PointTrait, 2) => AG.wkbPoint, (AG.GeoInterface.PointTrait, 3) => AG.wkbPoint25D, (AG.GeoInterface.PointTrait, 4) => AG.wkbPointZM, @@ -42,7 +42,6 @@ const lookup_type = Dict{Tuple{DataType,Int},AG.OGRwkbGeometryType}( (AG.GeoInterface.MultiPolygonTrait, 4) => AG.wkbMultiPolygonZM, ) - """ read(fn::AbstractString; kwargs...) read(fn::AbstractString, layer::Union{Integer,AbstractString}; kwargs...) @@ -61,7 +60,7 @@ function read(fn::AbstractString; kwargs...) return t end -function read(fn::AbstractString, layer::Union{Integer,AbstractString}; kwargs...) +function read(fn::AbstractString, layer::Union{Integer, AbstractString}; kwargs...) startswith(fn, "/vsi") || occursin(":", fn) || isfile(fn) || error("File not found.") t = AG.read(fn; kwargs...) do ds return read(ds, layer) @@ -72,7 +71,11 @@ end function read(ds, layer) df, gnames, sr = AG.getlayer(ds, layer) do table if table.ptr == C_NULL - throw(ArgumentError("Given layer id/name doesn't exist. For reference this is the dataset:\n$ds")) + throw( + ArgumentError( + "Given layer id/name doesn't exist. For reference this is the dataset:\n$ds", + ), + ) end names, _ = AG.schema_names(AG.getfeaturedefn(first(table))) sr = AG.getspatialref(table) @@ -84,12 +87,12 @@ function read(ds, layer) end crs = sr.ptr == C_NULL ? nothing : GFT.WellKnownText(GFT.CRS(), AG.toWKT(sr)) geometrycolumns = Tuple(gnames) - metadata!(df, "crs", crs, style=:default) - metadata!(df, "geometrycolumns", geometrycolumns, style=:default) + metadata!(df, "crs", crs; style = :default) + metadata!(df, "geometrycolumns", geometrycolumns; style = :default) # Also add the GEOINTERFACE:property as a namespaced thing - metadata!(df, "GEOINTERFACE:crs", crs, style=:default) - metadata!(df, "GEOINTERFACE:geometrycolumns", geometrycolumns, style=:default) + metadata!(df, "GEOINTERFACE:crs", crs; style = :default) + metadata!(df, "GEOINTERFACE:geometrycolumns", geometrycolumns; style = :default) return df end @@ -98,12 +101,24 @@ end Write the provided `table` to `fn`. The `geom_column` is expected to hold ArchGDAL geometries. """ -function write(fn::AbstractString, table; layer_name::AbstractString="data", crs::Union{GFT.GeoFormat,Nothing}=getcrs(table), driver::Union{Nothing,AbstractString}=nothing, options::Dict{String,String}=Dict{String,String}(), geom_columns=getgeometrycolumns(table), kwargs...) +function write( + fn::AbstractString, + table; + layer_name::AbstractString = "data", + crs::Union{GFT.GeoFormat, Nothing} = getcrs(table), + driver::Union{Nothing, AbstractString} = nothing, + options::Dict{String, String} = Dict{String, String}(), + geom_columns = getgeometrycolumns(table), + chunksize = 20_000, + kwargs..., +) rows = Tables.rows(table) sch = Tables.schema(rows) # Determine geometry columns - isnothing(geom_columns) && error("Please set `geom_columns` kw or define `GeoInterface.geometrycolumns` for $(typeof(table))") + isnothing(geom_columns) && error( + "Please set `geom_columns` kw or define `GeoInterface.geometrycolumns` for $(typeof(table))", + ) if :geom_column in keys(kwargs) # backwards compatible geom_columns = (kwargs[:geom_column],) end @@ -113,7 +128,11 @@ function write(fn::AbstractString, table; layer_name::AbstractString="data", crs trait = AG.GeoInterface.geomtrait(getproperty(first(rows), geom_column)) ndim = AG.GeoInterface.ncoord(getproperty(first(rows), geom_column)) geom_type = get(lookup_type, (typeof(trait), ndim), nothing) - isnothing(geom_type) && throw(ArgumentError("Can't convert $trait with $ndim dimensions of column $geom_column to ArchGDAL yet.")) + isnothing(geom_type) && throw( + ArgumentError( + "Can't convert $trait with $ndim dimensions of column $geom_column to ArchGDAL yet.", + ), + ) push!(geom_types, geom_type) end @@ -130,10 +149,12 @@ function write(fn::AbstractString, table; layer_name::AbstractString="data", crs end # Figure out attributes - fields = Vector{Tuple{Symbol,DataType}}() + fields = Vector{Tuple{Symbol, DataType}}() for (name, type) in zip(sch.names, sch.types) if !(name in geom_columns) - AG.GeoInterface.isgeometry(type) && error("Did you mean to use the `geom_columns` argument to specify $name is a geometry?") + AG.GeoInterface.isgeometry(type) && error( + "Did you mean to use the `geom_columns` argument to specify $name is a geometry?", + ) types = Base.uniontypes(type) if length(types) == 1 push!(fields, (Symbol(name), type)) @@ -144,47 +165,105 @@ function write(fn::AbstractString, table; layer_name::AbstractString="data", crs end end end - AG.create( - fn, - driver=driver - ) do ds + AG.create(fn; driver = driver) do ds AG.newspatialref() do spatialref crs !== nothing && AG.importCRS!(spatialref, crs) - AG.createlayer( - name=layer_name, - geom=first(geom_types), # how to set the name though? - spatialref=spatialref, - options=stringlist(options) + + can_create_layer = AG.testcapability(ds, "CreateLayer") + can_use_transaction = AG.testcapability(ds, "Transactions") + + AG.createlayer(; + name = layer_name, + dataset = can_create_layer ? ds : AG.create(AG.getdriver("Memory")), + geom = first(geom_types), # how to set the name though? + spatialref = spatialref, + options = stringlist(options), ) do layer - for (i, (geom_column, geom_type)) in enumerate(zip(geom_columns, geom_types)) + for (i, (geom_column, geom_type)) in + enumerate(zip(geom_columns, geom_types)) if i > 1 AG.writegeomdefn!(layer, string(geom_column), geom_type) end end + fieldindices = Int[] for (name, type) in fields AG.createfielddefn(String(name), convert(AG.OGRFieldType, type)) do fd AG.setsubtype!(fd, convert(AG.OGRFieldSubType, type)) AG.addfielddefn!(layer, fd) end + push!(fieldindices, AG.findfieldindex(layer, name, false)) end - for row in rows - AG.createfeature(layer) do feature - for (i, (geom_column)) in enumerate(geom_columns) - AG.setgeom!(feature, i - 1, GeoInterface.convert(AG.IGeometry, getproperty(row, geom_column))) - end - for (name, _) in fields - field = getproperty(row, name) - if !ismissing(field) - AG.setfield!(feature, AG.findfieldindex(feature, name), getproperty(row, name)) - else - AG.GDAL.ogr_f_setfieldnull(feature.ptr, AG.findfieldindex(feature, name)) + + for chunk in Iterators.partition(rows, chunksize) + can_use_transaction && + AG.GDAL.gdaldatasetstarttransaction(ds.ptr, false) + + for row in chunk + AG.addfeature(layer) do feature + for (i, (geom_column)) in enumerate(geom_columns) + AG.GDAL.ogr_f_setgeomfielddirectly( + feature.ptr, + i - 1, + _convert( + AG.Geometry, + Tables.getcolumn(row, geom_column), + ), + ) + end + for (i, (name, _)) in zip(fieldindices, fields) + field = Tables.getcolumn(row, name) + if !ismissing(field) + AG.setfield!(feature, i, field) + else + AG.GDAL.ogr_f_setfieldnull(feature.ptr, i) + end end end end + if can_use_transaction + try + AG.GDAL.gdaldatasetcommittransaction(ds.ptr) + catch e + e isa AG.GDAL.GDALError && + AG.GDAL.gdaldatasetrollbacktransaction(ds.ptr) + rethrow(e) + end + end + end + if !can_create_layer + @warn "Can't create layers in this format, copying from memory instead." + AG.copy( + layer; + dataset = ds, + name = layer_name, + options = stringlist(options), + ) end - AG.copy(layer, dataset=ds, name=layer_name, options=stringlist(options)) end end end fn end + +# This should be upstreamed to ArchGDAL +const lookup_method = Dict{DataType, Function}( + GeoInterface.PointTrait => AG.unsafe_createpoint, + GeoInterface.MultiPointTrait => AG.unsafe_createmultipoint, + GeoInterface.LineStringTrait => AG.unsafe_createlinestring, + GeoInterface.LinearRingTrait => AG.unsafe_createlinearring, + GeoInterface.MultiLineStringTrait => AG.unsafe_createmultilinestring, + GeoInterface.PolygonTrait => AG.unsafe_createpolygon, + GeoInterface.MultiPolygonTrait => AG.unsafe_createmultipolygon, +) + +function _convert(::Type{T}, geom) where {T <: AG.Geometry} + f = get(lookup_method, typeof(GeoInterface.geomtrait(geom)), nothing) + isnothing(f) && error( + "Cannot convert an object of $(typeof(geom)) with the $(typeof(type)) trait (yet). Please report an issue.", + ) + return f(GeoInterface.coordinates(geom)) +end + +function _convert(::Type{T}, geom::AG.IGeometry) where {T <: AG.Geometry} + return AG.unsafe_clone(geom) +end diff --git a/src/utils.jl b/src/utils.jl index b142efb..df82781 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -function stringlist(dict::Dict{String,String}) +function stringlist(dict::Dict{String, String}) sv = Vector{String}() for (k, v) in pairs(dict) push!(sv, uppercase(string(k)) * "=" * string(v)) @@ -81,5 +81,6 @@ end # Since `DataFrameRow` is simply a view of a DataFrame, we can reach back # to the original DataFrame to get the metadata. -GeoInterface.geometrycolumns(row::DataFrameRow) = GeoInterface.geometrycolumns(getfield(row, :df)) # get the parent of the row view +GeoInterface.geometrycolumns(row::DataFrameRow) = + GeoInterface.geometrycolumns(getfield(row, :df)) # get the parent of the row view GeoInterface.crs(row::DataFrameRow) = GeoInterface.crs(getfield(row, :df)) # get the parent of the row view diff --git a/test/runtests.jl b/test/runtests.jl index 6ca7879..146bed3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,21 +14,41 @@ isdir(testdatadir) || mkdir(testdatadir) REPO_URL = "https://github.com/yeesian/ArchGDALDatasets/blob/master/" remotefiles = [ - ("ospy/data1/sites.dbf", "7df95edea06c46418287ae3430887f44f9116b29715783f7d1a11b2b931d6e7d"), - ("ospy/data1/sites.prj", "81fb1a246728609a446b25b0df9ede41c3e7b6a133ce78f10edbd2647fc38ce1"), - ("ospy/data1/sites.sbn", "198d9d695f3e7a0a0ac0ebfd6afbe044b78db3e685fffd241a32396e8b341ed3"), - ("ospy/data1/sites.sbx", "49bbe1942b899d52cf1d1b01ea10bd481ec40bdc4c94ff866aece5e81f2261f6"), - ("ospy/data1/sites.shp", "69af5a6184053f0b71f266dc54c944f1ec02013fb66dbb33412d8b1976d5ea2b"), - ("ospy/data1/sites.shx", "1f3da459ccb151958743171e41e6a01810b2a007305d55666e01d680da7bbf08"), + ( + "ospy/data1/sites.dbf", + "7df95edea06c46418287ae3430887f44f9116b29715783f7d1a11b2b931d6e7d", + ), + ( + "ospy/data1/sites.prj", + "81fb1a246728609a446b25b0df9ede41c3e7b6a133ce78f10edbd2647fc38ce1", + ), + ( + "ospy/data1/sites.sbn", + "198d9d695f3e7a0a0ac0ebfd6afbe044b78db3e685fffd241a32396e8b341ed3", + ), + ( + "ospy/data1/sites.sbx", + "49bbe1942b899d52cf1d1b01ea10bd481ec40bdc4c94ff866aece5e81f2261f6", + ), + ( + "ospy/data1/sites.shp", + "69af5a6184053f0b71f266dc54c944f1ec02013fb66dbb33412d8b1976d5ea2b", + ), + ( + "ospy/data1/sites.shx", + "1f3da459ccb151958743171e41e6a01810b2a007305d55666e01d680da7bbf08", + ), ] -@info "Downloading test files..." for (f, sha) in remotefiles localfn = joinpath(testdatadir, basename(f)) url = REPO_URL * f * "?raw=true" - PlatformEngines.download_verify(url, sha, localfn; force=true) + PlatformEngines.download_verify(url, sha, localfn; force = true, quiet_download = false) end -unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\",DATUM[\"unknown\",SPHEROID[\"unknown\",6378137,298.257223563]],PRIMEM[\"Greenwich\",0],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AXIS[\"Latitude\",NORTH],AXIS[\"Longitude\",EAST]]") +unknown_crs = GFT.WellKnownText( + GFT.CRS(), + "GEOGCS[\"Undefined geographic SRS\",DATUM[\"unknown\",SPHEROID[\"unknown\",6378137,298.257223563]],PRIMEM[\"Greenwich\",0],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AXIS[\"Latitude\",NORTH],AXIS[\"Longitude\",EAST]]", +) @testset "GeoDataFrames.jl" begin fn = joinpath(testdatadir, "sites.shp") @@ -80,10 +100,18 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", @testset "Read self written file" begin # Save table with a few random points - table = DataFrame(geometry=AG.createpoint.(coords), name="test") + table = DataFrame(; geometry = AG.createpoint.(coords), name = "test") GDF.write(joinpath(testdatadir, "test_points.shp"), table) - GDF.write(joinpath(testdatadir, "test_points.gpkg"), table; layer_name="test_points") - GDF.write(joinpath(testdatadir, "test_points.geojson"), table; layer_name="test_points") + GDF.write( + joinpath(testdatadir, "test_points.gpkg"), + table; + layer_name = "test_points", + ) + GDF.write( + joinpath(testdatadir, "test_points.geojson"), + table; + layer_name = "test_points", + ) ntable = GDF.read(joinpath(testdatadir, "test_points.shp")) @test nrow(ntable) == 10 @@ -92,8 +120,12 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", ntable = GDF.read(joinpath(testdatadir, "test_points.geojson")) @test nrow(ntable) == 10 - tablez = DataFrame(geometry=AG.createpoint.(coords3), name="test") - GDF.write(joinpath(testdatadir, "test_pointsz.gpkg"), tablez; layer_name="test_points") + tablez = DataFrame(; geometry = AG.createpoint.(coords3), name = "test") + GDF.write( + joinpath(testdatadir, "test_pointsz.gpkg"), + tablez; + layer_name = "test_points", + ) ntable = GDF.read(joinpath(testdatadir, "test_pointsz.gpkg")) @test GI.ncoord(ntable.geometry[1]) == 3 end @@ -102,26 +134,29 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", t = GDF.read(fn) # Save table from reading - GDF.write(joinpath(testdatadir, "test_read.shp"), t; layer_name="test_coastline") - GDF.write(joinpath(testdatadir, "test_read.gpkg"), t; layer_name="test_coastline") - GDF.write(joinpath(testdatadir, "test_read.geojson"), t; layer_name="test_coastline") - + GDF.write(joinpath(testdatadir, "test_read.shp"), t; layer_name = "test_coastline") + GDF.write(joinpath(testdatadir, "test_read.gpkg"), t; layer_name = "test_coastline") + GDF.write( + joinpath(testdatadir, "test_read.geojson"), + t; + layer_name = "test_coastline", + ) end @testset "Write shapefile with non-GDAL types" begin coords = collect(zip(rand(Float32, 2), rand(Float32, 2))) - t = DataFrame( - geometry=AG.createpoint.(coords), - name=["test", "test2"], - flag=UInt8[typemin(UInt8), typemax(UInt8)], - ex1=Int16[typemin(Int8), typemax(Int8)], - ex2=Int32[typemin(UInt16), typemax(UInt16)], - ex3=Int64[typemin(UInt32), typemax(UInt32)], - check=[false, true], - z=Float32[Float32(8), Float32(-1)], - y=Float16[Float16(8), Float16(-1)], - odd=[1, missing], - date=[DateTime("2022-03-31T15:38:41"), DateTime("2022-03-31T15:38:41")] + t = DataFrame(; + geometry = AG.createpoint.(coords), + name = ["test", "test2"], + flag = UInt8[typemin(UInt8), typemax(UInt8)], + ex1 = Int16[typemin(Int8), typemax(Int8)], + ex2 = Int32[typemin(UInt16), typemax(UInt16)], + ex3 = Int64[typemin(UInt32), typemax(UInt32)], + check = [false, true], + z = Float32[Float32(8), Float32(-1)], + y = Float16[Float16(8), Float16(-1)], + odd = [1, missing], + date = [DateTime("2022-03-31T15:38:41"), DateTime("2022-03-31T15:38:41")], ) GDF.write(joinpath(testdatadir, "test_exotic.shp"), t) GDF.write(joinpath(testdatadir, "test_exotic.gpkg"), t) @@ -146,7 +181,7 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", end @testset "Spatial operations" begin - table = DataFrame(geometry=AG.createpoint.(coords), name="test") + table = DataFrame(; geometry = AG.createpoint.(coords), name = "test") # Buffer to also write polygons table.geometry = AG.buffer(table.geometry, 10) @@ -156,30 +191,56 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", end @testset "Reproject" begin - table = DataFrame(geometry=AG.createpoint.([[0, 0, 0]]), name="test") + table = DataFrame(; geometry = AG.createpoint.([[0, 0, 0]]), name = "test") AG.reproject(table.geometry, GFT.EPSG(4326), GFT.EPSG(28992)) @test GDF.AG.getpoint(table.geometry[1], 0)[1] ≈ -587791.596556932 - GDF.write(joinpath(testdatadir, "test_reprojection.gpkg"), table; crs=GFT.EPSG(28992)) + GDF.write( + joinpath(testdatadir, "test_reprojection.gpkg"), + table; + crs = GFT.EPSG(28992), + ) end @testset "Kwargs" begin - table = DataFrame(foo=AG.createpoint.([[0, 0, 0]]), name="test") - GDF.write(joinpath(testdatadir, "test_options1.gpkg"), table; geom_column=:foo) - GDF.write(joinpath(testdatadir, "test_options2.gpkg"), table; geom_columns=Set((:foo,))) - - table = DataFrame(foo=AG.createpoint.([[0, 0, 0]]), bar=AG.createpoint.([[0, 0, 0]]), name="test") - @test_throws Exception GDF.write(joinpath(testdatadir, "test_options3.gpkg"), table; geom_column=:foo) - GDF.write(joinpath(testdatadir, "test_options3.gpkg"), table; geom_columns=Set((:foo, :bar))) + table = DataFrame(; foo = AG.createpoint.([[0, 0, 0]]), name = "test") + GDF.write(joinpath(testdatadir, "test_options1.gpkg"), table; geom_column = :foo) + GDF.write( + joinpath(testdatadir, "test_options2.gpkg"), + table; + geom_columns = Set((:foo,)), + ) - table = DataFrame(foo=AG.createpoint.([[0, 0, 0]]), name="test") - GDF.write(joinpath(testdatadir, "test_options4.gpkg"), table; options=Dict( - "GEOMETRY_NAME" => "bar", "DESCRIPTION" => "Written by GeoDataFrames.jl"), geom_column=:foo) + table = DataFrame(; + foo = AG.createpoint.([[0, 0, 0]]), + bar = AG.createpoint.([[0, 0, 0]]), + name = "test", + ) + @test_throws Exception GDF.write( + joinpath(testdatadir, "test_options3.gpkg"), + table; + geom_column = :foo, + ) # wrong argument + @test_throws AG.GDAL.GDALError GDF.write( + joinpath(testdatadir, "test_options3.gpkg"), + table; + geom_columns = Set((:foo, :bar)), + ) # two geometry columns + table = DataFrame(; foo = AG.createpoint.([[0, 0, 0]]), name = "test") + GDF.write( + joinpath(testdatadir, "test_options4.gpkg"), + table; + options = Dict( + "GEOMETRY_NAME" => "bar", + "DESCRIPTION" => "Written by GeoDataFrames.jl", + ), + geom_column = :foo, + ) end @testset "GeoInterface" begin tfn = joinpath(testdatadir, "test_geointerface.gpkg") - table = [(; foo=AG.createpoint(1.0, 2.0), name="test")] + table = [(; foo = AG.createpoint(1.0, 2.0), name = "test")] @test_throws Exception GDF.write(tfn, table) GI.isfeaturecollection(::Vector{<:NamedTuple}) = true GI.geomtrait(::Vector{<:NamedTuple}) = GI.FeatureCollectionTrait() # TODO Make issue GeoInterface.jl @@ -191,11 +252,11 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", @testset "Metadata" begin tfn = joinpath(testdatadir, "test_meta.gpkg") - table = DataFrame(bar=AG.createpoint(1.0, 2.0), name="test") + table = DataFrame(; bar = AG.createpoint(1.0, 2.0), name = "test") @test_throws Exception GDF.write(tfn, table) - meta = Dict{String,Any}("crs" => nothing, "geometrycolumns" => (:bar,)) + meta = Dict{String, Any}("crs" => nothing, "geometrycolumns" => (:bar,)) for pair in meta - metadata!(table, pair.first, pair.second, style=:default) + metadata!(table, pair.first, pair.second; style = :default) end @test isfile(GDF.write(tfn, table)) t = GDF.read(tfn) @@ -204,5 +265,4 @@ unknown_crs = GFT.WellKnownText(GFT.CRS(), "GEOGCS[\"Undefined geographic SRS\", @test meta["GEOINTERFACE:geometrycolumns"] == meta["geometrycolumns"] == (:bar,) @test isempty(setdiff(keys(meta), metadatakeys(t))) end - end