# Icechunk with dummy data

This demo illustrates how to use Icechunk as a Zarr store.

In [2]:
import math
import time

import numpy as np
import zarr
from icechunk import IcechunkStore, StorageConfig

## Create a new Zarr store backed by Icechunk

This example uses an in-memory store.

In [3]:
store = await IcechunkStore.create(
 storage=StorageConfig.memory("icechunk-demo"),
 mode="w",
)
store



This dictionary will contain array paths and data that were written to Icechunk, so that we can check correctness.

In [4]:
# Maps an array path (e.g. "root-foo") to the NumPy data written there, for the final correctness check.
expected: dict[str, np.ndarray] = {}

These two utility functions generate and write dummy array data to a group.

In [5]:
def generate_array_chunks(size: int, dtype=np.int32):
    """Build a deterministic 4-D test array together with a chunk shape for it.

    The z and t axes have fixed lengths (64 and 128). The x and y axes share a
    single length, the integer part of ``sqrt(size / 64 / 128)``, so the array
    holds at most ``size`` elements.

    Args:
        size: Target number of elements in the generated array.
        dtype: NumPy dtype of the generated values.

    Returns:
        A ``(array, chunk_shape)`` pair. ``array`` has shape
        ``(nx, ny, 64, 128)`` and is an ``np.arange`` sequence reshaped to
        that shape; ``chunk_shape`` is ``(cx, cy, 8, 2)`` in the same axis
        order.
    """
    z_len, t_len = 64, 128
    side = int(math.sqrt(size / z_len / t_len))  # shared length of the x and y axes
    shape = (side, side, z_len, t_len)

    # Roughly a third of x and half of y per chunk, but never less than one element.
    chunk_shape = (max(side // 3, 1), max(side // 2, 1), 8, 2)

    values = np.arange(side * side * z_len * t_len, dtype=dtype)
    return values.reshape(shape), chunk_shape


def create_array(*, group, name, size, dtype, fill_value) -> np.ndarray:
    """Generate dummy data and write it into ``group`` as the array ``name``.

    Args:
        group: Zarr group that receives the new array.
        name: Name of the array inside ``group``.
        size: Target element count, forwarded to ``generate_array_chunks``.
        dtype: NumPy dtype used for both the generated data and the stored array.
        fill_value: Fill value recorded in the array metadata.

    Returns:
        The in-memory NumPy array that was written, so callers can compare it
        with what is read back from the store.
    """
    data, chunks = generate_array_chunks(size=size, dtype=dtype)

    group.create_array(
        name=name,
        shape=data.shape,
        dtype=dtype,
        fill_value=fill_value,
        chunk_shape=chunks,
        dimension_names=("x", "y", "z", "t"),
        attributes={"description": "icechunk test data"},
        data=data,
        exists_ok=True,  # the notebook later re-creates "root-foo" through this same call
    )

    return data

## A versioned transactional Zarr store

### Open the root group, write an attribute, commit

In [6]:
# Create the root group in the Icechunk-backed store (overwrite=True is passed to zarr.group).
root_group = zarr.group(store=store, overwrite=True)
# Attribute writes go through the store; they become part of history at the next commit.
root_group.attrs["foo"] = "foo"
dict(root_group.attrs) # check that it was written

{'foo': 'foo'}

Commit that change

In [7]:
# commit() returns a string ID (displayed below); keep it so a later cell can check this version out.
first_commit = await store.commit("wrote a root group attribute")
first_commit

'2471WQXNN789CTT07172HDDBQG'

### Add an array to the root group

We save the created array in `expected` so that we can verify the write later.

In [8]:
# `size` is a target element count; the array written here ends up with shape (5, 5, 64, 128).
expected["root-foo"] = create_array(
 group=root_group,
 name="root-foo",
 size=1 * 1024 * 256,
 dtype=np.int32,
 fill_value=-1,
)

In [9]:
# List the root group's members; at this point "root-foo" is the only one.
print(root_group.members())

store_path 
{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}
(('root-foo', /root-foo shape=(5, 5, 64, 128) dtype=int32>),)


In [10]:
# Only the "description" attribute set by create_array is present so far.
dict(root_group["root-foo"].attrs)

{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}


{'description': 'icechunk test data'}

In [11]:
# Add a second attribute to the array; it is included in the next commit.
root_group["root-foo"].attrs["update"] = "new attr"

{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}


In [12]:
# One commit captures both the new array and the attribute added to it.
second_commit = await store.commit("added array, updated attr")
second_commit

'SQ4T6Y45DXY9F0EYXE7ECBWHPC'

In [13]:
# After the commit, "root-foo" carries two attributes ("description" and "update") ...
assert len(root_group["root-foo"].attrs) == 2
# ... and it is still the only member of the root group.
assert len(root_group.members()) == 1

{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data",
 "update": "new attr"
 }
}
store_path 
{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data",
 "update": "new attr"
 }
}


### Committing when not on `HEAD` will fail

In [14]:
# Check out the first commit by its ID rather than by branch name, then modify the group.
await store.checkout(first_commit)
root_group.attrs["update"] = "new attr 2"

# The commit is expected to be rejected; the error text is printed as the cell output.
try:
 await store.commit("new attr 2")
except ValueError as e:
 print(e)
else:
 # Reached only if commit() did not raise; this ValueError is outside the try body, so it propagates.
 raise ValueError("should have failed")

store error: all commits must be made on a branch


### Checkout `HEAD`, make a change, and commit.

In [19]:
# NOTE(review): reset() presumably discards the uncommitted attribute change made in the
# previous cell - confirm against the Icechunk documentation.
await store.reset()
# Return to the "main" branch so that committing is allowed again.
await store.checkout(branch="main")
root_group["root-foo"].attrs["update"] = "new attr 2"
third_commit = await store.commit("new attr 2")
third_commit

{
 "shape": [
 5,
 5,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data",
 "update": "new attr"
 }
}


'DZNCW2X281JE5PXSE15N85THAW'

In [21]:
root_group.attrs["update"] = "new attr 2"
fourth_commit = await store.commit("rewrote array")
fourth_commit

'DP8CF4KE7XYHVD0H1GPZ8H58V0'

### Create a hierarchy

In [22]:
# Show the dtype of every array recorded in `expected` so far.
{k: v.dtype for k, v in expected.items()}

{'root-foo': dtype('int32')}

In [23]:
newgroup = zarr.group(store=store, path="group1/")
expected["group1/foo1"] = create_array(
 group=newgroup, name="foo1", dtype=np.float32, size=1 * 1024 * 128, fill_value=-1234
)
expected["group1/foo2"] = create_array(
 group=newgroup, name="foo2", dtype=np.float16, size=1 * 1024 * 64, fill_value=-1234
)
newgroup = zarr.group(store=store, path="group2/")
expected["group2/foo3"] = create_array(
 group=newgroup, name="foo3", dtype=np.int64, size=1 * 1024 * 32, fill_value=-1234
)
fifth_commit = await store.commit("added groups and arrays")
fifth_commit

'3ZBWXTZEYPJH8MEZVKM5MW7S0G'

### Overwrite an array

In [24]:
# Re-create "root-foo" with half the requested size (shape becomes (4, 4, 64, 128))
# and replace the reference data in `expected` to match.
expected["root-foo"] = create_array(
 group=root_group,
 name="root-foo",
 size=1 * 1024 * 128,
 dtype=np.int32,
 fill_value=-1,
)

In [25]:
# Commit the overwrite; the returned ID is displayed but not stored in a variable.
await store.commit("overwrote root-foo")

'B33DVM1FBXFYB0S1EVH8SHG29G'

### Examine the hierarchy

In [26]:
# The root now holds the two new groups alongside the overwritten "root-foo" array.
root_group.members()

store_path 
{
 "shape": [
 4,
 4,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}


(('group1',
 Group(_async_group=/group1>)),
 ('root-foo',
 /root-foo shape=(4, 4, 64, 128) dtype=int32>),
 ('group2',
 Group(_async_group=/group2>)))

In [27]:
# "group1" contains the arrays "foo1" (float32) and "foo2" (float16).
root_group["group1"].members()

store_path group1
{
 "shape": [
 4,
 4,
 64,
 128
 ],
 "data_type": "float32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1234.0,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}
{
 "shape": [
 2,
 2,
 64,
 128
 ],
 "data_type": "float16",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 1,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1234.0,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}


(('foo1',
 /group1/foo1 shape=(4, 4, 64, 128) dtype=float32>),
 ('foo2',
 /group1/foo2 shape=(2, 2, 64, 128) dtype=float16>))

In [28]:
# "group2" contains the single array "foo3" (int64).
root_group["group2"].members()

store_path group2
{
 "shape": [
 2,
 2,
 64,
 128
 ],
 "data_type": "int64",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 1,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1234,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}


(('foo3',
 /group2/foo3 shape=(2, 2, 64, 128) dtype=int64>),)

### Append

In [29]:
array = root_group["group2/foo3"]
print(array)

array = array.resize((array.shape[0] * 2, *array.shape[1:]))
print(array)
array[array.shape[0] // 2 :, ...] = expected["group2/foo3"]
print(array[2:, 0, 0, 0])
expected["group2/foo3"] = np.concatenate([expected["group2/foo3"]] * 2, axis=0)

await store.commit("appended to group2/foo3")

{
 "shape": [
 2,
 2,
 64,
 128
 ],
 "data_type": "int64",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 1,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1234,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}
/group2/foo3 shape=(2, 2, 64, 128) dtype=int64>
/group2/foo3 shape=(4, 2, 64, 128) dtype=int64>
[ 0 16384]


'JG601CAP09Q7P19RQ7JSH3AWNR'

### Check that values are correct

In [30]:
# Read every array recorded in `expected` back from the store and compare it
# with the in-memory reference (dtype first, then all values).
for key, value in expected.items():
    print(key)
    tic = time.perf_counter()  # monotonic clock intended for measuring elapsed time
    array = root_group[key]
    assert array.dtype == value.dtype, (array.dtype, value.dtype)
    # Ceiling division: a partial chunk at the edge of an axis still counts as a chunk
    # (e.g. length 5 with chunk length 2 is 3 chunks, not 2).
    numchunks = math.prod(
        (s + c - 1) // c for s, c in zip(array.shape, array.chunks, strict=True)
    )
    print(f"numchunks: {numchunks}")
    np.testing.assert_array_equal(array[:], value)
    print(time.perf_counter() - tic)

root-foo
{
 "shape": [
 4,
 4,
 64,
 128
 ],
 "data_type": "int32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}
numchunks: 4096
0.486346960067749
group1/foo1
{
 "shape": [
 4,
 4,
 64,
 128
 ],
 "data_type": "float32",
 "chunk_grid": {
 "name": "regular",
 "configuration": {
 "chunk_shape": [
 1,
 2,
 8,
 2
 ]
 }
 },
 "chunk_key_encoding": {
 "name": "default",
 "configuration": {
 "separator": "/"
 }
 },
 "fill_value": -1234.0,
 "codecs": [
 {
 "name": "bytes",
 "configuration": {
 "endian": "little"
 }
 }
 ],
 "dimension_names": [
 "x",
 "y",
 "z",
 "t"
 ],
 "attributes": {
 "description": "icechunk test data"
 }
}
numchunks: 4096
0.

change values of "group1/foo1"