Treat arrays on disk as if they were regular Julia arrays
The usual array operations should just work:
getindex / setindex!
map and broadcast
reduce and mapreduce
iterate, collect, zip, …

This is exactly what AbstractArrays are for:

struct AccessCountArray{T,N,P<:AbstractArray{T,N}} <: AbstractArray{T,N}
    parent::P
    accesscount::Base.RefValue{Int}   # counts every single-element access
end
AccessCountArray(a::AbstractArray) = AccessCountArray(a, Ref(0))

# Minimal AbstractArray interface: size, index style and element access
Base.size(a::AccessCountArray) = size(a.parent)
Base.IndexStyle(::Type{<:AccessCountArray}) = IndexLinear()
function Base.getindex(a::AccessCountArray, i::Int)
    a.accesscount[] += 1   # every getindex call hits this counter
    a.parent[i]
end
ca = AccessCountArray(reshape(1:50,5,10))
5×10 AccessCountArray{Int64, 2, Base.ReshapedArray{Int64, 2, UnitRange{Int64}, Tuple{}}}:
1 6 11 16 21 26 31 36 41 46
2 7 12 17 22 27 32 37 42 47
3 8 13 18 23 28 33 38 43 48
4 9 14 19 24 29 34 39 44 49
5 10 15 20 25 30 35 40 45 50
ca .+ 3.0
5×10 Matrix{Float64}:
4.0 9.0 14.0 19.0 24.0 29.0 34.0 39.0 44.0 49.0
5.0 10.0 15.0 20.0 25.0 30.0 35.0 40.0 45.0 50.0
6.0 11.0 16.0 21.0 26.0 31.0 36.0 41.0 46.0 51.0
7.0 12.0 17.0 22.0 27.0 32.0 37.0 42.0 47.0 52.0
8.0 13.0 18.0 23.0 28.0 33.0 38.0 43.0 48.0 53.0
Assumption behind these generic fallbacks: access to single elements is efficient. For data on disk or over the network every element access may trigger a separate read, so this assumption breaks down.
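As a rough illustration, one can count the element accesses with the AccessCountArray defined above (a sketch; the exact count can vary, since displaying the array also goes through getindex):

ca.accesscount[] = 0   # reset the counter
ca .+ 3.0              # generic broadcast falls back to element-wise getindex
ca.accesscount[]       # expect on the order of 50 accesses, one per element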
Example implementation for NetCDF.jl
# Implement the DiskArrays interface
import DiskArrays: AbstractDiskArray, readblock!, writeblock!

# The variable type subtypes AbstractDiskArray instead of AbstractArray
mutable struct NcVar{T,N,M} <: AbstractDiskArray{T,N}
    ...
end

# Read an N-dimensional rectangle of data from the file into aout
function readblock!(v::NcVar{<:Any,N}, aout, r::Vararg{<:AbstractUnitRange,N}) where N
    @assert length(r) == ndims(aout)
    @assert size(aout) == length.(r)
    ...
end

# Write the array a into the N-dimensional rectangle of the file given by r
function writeblock!(v::NcVar{<:Any,N}, a, r::Vararg{<:AbstractUnitRange,N}) where N
    ...
end
using DiskArrays: DiskArrays, AbstractDiskArray

struct AccessCountDiskArray{T,N,P<:AbstractArray{T,N}} <: AbstractDiskArray{T,N}
    parent::P
    accesscount::Base.RefValue{Int}   # counts readblock! calls instead of element accesses
end
AccessCountDiskArray(a::AbstractArray) = AccessCountDiskArray(a, Ref(0))

Base.size(a::AccessCountDiskArray) = size(a.parent)

# Read a whole rectangle of data at once; only this counts as one access
function DiskArrays.readblock!(a::AccessCountDiskArray, aout, i::AbstractUnitRange...)
    aout .= view(a.parent, i...)
    a.accesscount[] += 1
end
da = AccessCountDiskArray(reshape(1:50,5,10))
5×10 AccessCountDiskArray{Int64, 2, Base.ReshapedArray{Int64, 2, UnitRange{Int64}, Tuple{}}}
Unchunked
da .+ 3.0
5×10 DiskArrays.BroadcastDiskArray{Float64, 2, Base.Broadcast.Broadcasted{DiskArrays.ChunkStyle{2}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(+), Tuple{AccessCountDiskArray{Int64, 2, Base.ReshapedArray{Int64, 2, UnitRange{Int64}, Tuple{}}}, Float64}}}
Chunked: (
[5]
[10]
)
collect(da .+ 3.0)
5×10 Matrix{Float64}:
4.0 9.0 14.0 19.0 24.0 29.0 34.0 39.0 44.0 49.0
5.0 10.0 15.0 20.0 25.0 30.0 35.0 40.0 45.0 50.0
6.0 11.0 16.0 21.0 26.0 31.0 36.0 41.0 46.0 51.0
7.0 12.0 17.0 22.0 27.0 32.0 37.0 42.0 47.0 52.0
8.0 13.0 18.0 23.0 28.0 33.0 38.0 43.0 48.0 53.0
sum(da .+ 3.0, dims=1)
1×10 Matrix{Float64}:
30.0 55.0 80.0 105.0 130.0 155.0 180.0 205.0 230.0 255.0
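One way to confirm that the computations above went through a few readblock! calls rather than one read per element is to inspect the counter (a sketch; the exact number depends on chunk sizes and the DiskArrays version):

da.accesscount[]   # expect a small number of block reads, far fewer than the 50 elements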
Subtypes of AbstractDiskArray only need to implement readblock! and writeblock! to read and write n-dimensional rectangles of their data; all higher-level operations are then mapped onto these readblock! and writeblock! calls.
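For completeness, a minimal sketch of the writeblock! counterpart for the AccessCountDiskArray above (assuming its parent array is writable, e.g. a Matrix rather than a reshaped range):

function DiskArrays.writeblock!(a::AccessCountDiskArray, ain, i::AbstractUnitRange...)
    view(a.parent, i...) .= ain   # write a whole rectangle in one go
    a.accesscount[] += 1          # count the block write like a block read
end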
import DiskArrays: haschunks, eachchunk, Chunked, Unchunked, GridChunks,
                   SubRanges, estimate_chunksize

# A variable is unchunked if the file reports a chunk size of all zeros
haschunks(v::NcVar) = all(iszero, v.chunksize) ? Unchunked(SubRanges()) : Chunked()

# Report the chunk grid of the variable
eachchunk(v::NcVar) = getchunksize(v)
getchunksize(v::NcVar) = getchunksize(haschunks(v), v)
# NetCDF reports chunk sizes in reversed (row-major) order
getchunksize(::Chunked, v::NcVar) = GridChunks(v, reverse(map(Int64, v.chunksize)))
getchunksize(::Unchunked, v::NcVar) = estimate_chunksize(v)
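As a toy version of the same interface, one could declare artificial chunks for the AccessCountDiskArray defined earlier (a sketch; the 5×5 chunk size is arbitrary):

DiskArrays.haschunks(a::AccessCountDiskArray) = DiskArrays.Chunked()
# hard-coded 5×5 chunks, so this sketch only makes sense for the 2-d example above
DiskArrays.eachchunk(a::AccessCountDiskArray) = DiskArrays.GridChunks(a, (5, 5))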
If haschunks is not defined for an AbstractDiskArray, it defaults to Unchunked, and the data is split along the last dimension(s) so that the parts fit into memory. If haschunks returns Chunked, a method for eachchunk needs to be implemented.

using DiskArrays: IrregularChunks, RegularChunks, GridChunks
dayinmonth = [31,28,31,30,31,30,31,31,30,31,30,31]
monthlychunks = IrregularChunks(chunksizes = dayinmonth)   # one chunk per month along the time axis
otherchunks = RegularChunks(200, 0, 200)                   # chunk size 200, offset 0, axis length 200
chunks = GridChunks(otherchunks, monthlychunks)
1×12 GridChunks{2, Tuple{RegularChunks, IrregularChunks}}:
(1:200, 1:31) (1:200, 32:59) … (1:200, 305:334) (1:200, 335:365)
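Iterating over the resulting GridChunks yields one tuple of index ranges per chunk, for example (a sketch reproducing the output below):

for c in chunks
    println("Doing something with chunk ", c)
end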
Doing something with chunk (1:200, 1:31)
Doing something with chunk (1:200, 32:59)
Doing something with chunk (1:200, 60:90)
Doing something with chunk (1:200, 91:120)
Doing something with chunk (1:200, 121:151)
Doing something with chunk (1:200, 152:181)
Doing something with chunk (1:200, 182:212)
Doing something with chunk (1:200, 213:243)
Doing something with chunk (1:200, 244:273)
Doing something with chunk (1:200, 274:304)
Doing something with chunk (1:200, 305:334)
Doing something with chunk (1:200, 335:365)
using Zarr
url = "https://s3.bgc-jena.mpg.de:9000/pyramids/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2020-fv4.0.zarr"
array = zopen(url)["agb"]
DiskArrays.eachchunk(array)
396×154×1 GridChunks{3, Tuple{RegularChunks, RegularChunks, RegularChunks}}:
[:, :, 1] =
(1:1024, 1:1024, 1:1) … (1:1024, 156673:157500, 1:1)
(1025:2048, 1:1024, 1:1) (1025:2048, 156673:157500, 1:1)
(2049:3072, 1:1024, 1:1) (2049:3072, 156673:157500, 1:1)
(3073:4096, 1:1024, 1:1) (3073:4096, 156673:157500, 1:1)
(4097:5120, 1:1024, 1:1) (4097:5120, 156673:157500, 1:1)
(5121:6144, 1:1024, 1:1) … (5121:6144, 156673:157500, 1:1)
(6145:7168, 1:1024, 1:1) (6145:7168, 156673:157500, 1:1)
(7169:8192, 1:1024, 1:1) (7169:8192, 156673:157500, 1:1)
(8193:9216, 1:1024, 1:1) (8193:9216, 156673:157500, 1:1)
(9217:10240, 1:1024, 1:1) (9217:10240, 156673:157500, 1:1)
⋮ ⋱
(396289:397312, 1:1024, 1:1) (396289:397312, 156673:157500, 1:1)
(397313:398336, 1:1024, 1:1) (397313:398336, 156673:157500, 1:1)
(398337:399360, 1:1024, 1:1) (398337:399360, 156673:157500, 1:1)
(399361:400384, 1:1024, 1:1) … (399361:400384, 156673:157500, 1:1)
(400385:401408, 1:1024, 1:1) (400385:401408, 156673:157500, 1:1)
(401409:402432, 1:1024, 1:1) (401409:402432, 156673:157500, 1:1)
(402433:403456, 1:1024, 1:1) (402433:403456, 156673:157500, 1:1)
(403457:404480, 1:1024, 1:1) (403457:404480, 156673:157500, 1:1)
(404481:405000, 1:1024, 1:1) … (404481:405000, 156673:157500, 1:1)
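Since the Zarr array is a DiskArray, the same chunk iteration works on this remote dataset. A sketch of chunk-aligned processing (assuming the values of the agb variable can simply be summed, and keeping in mind that this downloads every chunk):

function chunkwise_total(a)
    total = 0.0
    for c in DiskArrays.eachchunk(a)
        total += sum(a[c...])   # each chunk is read (downloaded) exactly once
    end
    return total
end

chunkwise_total(array)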
mapslices-like operations on large datasets

DiskArrays translates getindex calls into dense readblock! calls: for an access like a[[1,2,3,6,7,8]], the readblock! calls read the bounding box of the requested indices, keeping the number of readblock! calls small while not transferring unnecessary data.
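A sketch of how one could observe this with the AccessCountDiskArray from above (the exact number of readblock! calls depends on chunking and the DiskArrays version, but it should stay well below one read per requested element):

v = AccessCountDiskArray(collect(1.0:100.0))
v[[1, 2, 3, 6, 7, 8]]   # indices are grouped into (close to) a single bounding-box read
v.accesscount[]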
A big thank you to all contributors!

Limitations:
zip-ping DiskArrays with different chunks is fundamentally broken
setindex! is not as well tested as getindex (yet)