Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/4035.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added :class:`zarr.ChunkLayout`, a public, typed distillation of an array's declared chunk structure (chunk grid metadata plus chunk-structuring codecs) in the form :func:`zarr.create_array` accepts. ``Array.chunk_layout`` and ``Array.is_sharded`` answer "is this grid regular?" and "is this array sharded?" for regular *and* rectilinear grids without catching ``NotImplementedError`` or importing private metadata classes, and support reconstruction round-trips for downstream libraries such as xarray. Codecs gain an ``inner_chunk_layout()`` hook (overridden by ``ShardingCodec``) so sharding-like codecs participate without callers special-casing codec classes.
90 changes: 50 additions & 40 deletions design/chunk-grid.md

Large diffs are not rendered by default.

353 changes: 353 additions & 0 deletions design/chunk-layout.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/zarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
zeros_like,
)
from zarr.core.array import Array, AsyncArray
from zarr.core.chunk_layouts import ChunkLayout
from zarr.core.config import config
from zarr.core.group import AsyncGroup, Group

Expand Down Expand Up @@ -146,6 +147,7 @@ def set_format(log_format: str) -> None:
"Array",
"AsyncArray",
"AsyncGroup",
"ChunkLayout",
"Group",
"__version__",
"array",
Expand Down
13 changes: 13 additions & 0 deletions src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from zarr.abc.store import ByteGetter, ByteSetter, Store
from zarr.core.array_spec import ArraySpec
from zarr.core.chunk_layouts import ChunkLayout
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
from zarr.core.indexing import SelectorTuple
from zarr.core.metadata import ArrayMetadata
Expand Down Expand Up @@ -160,6 +161,18 @@ def validate(
The array chunk grid metadata
"""

def inner_chunk_layout(self) -> ChunkLayout | None:
    """Report the sub-chunk structure this codec creates inside each encoded chunk.

    The base implementation returns ``None``, meaning the codec treats each
    chunk as opaque and does not subdivide it. Codecs that do subdivide
    chunks (for example :class:`zarr.codecs.ShardingCodec`) override this
    method and return the layout of the sub-chunks.

    Callers are advised to look this method up defensively, e.g.
    ``getattr(codec, "inner_chunk_layout", lambda: None)``, so that
    third-party codecs written before this hook existed keep working.

    Returns
    -------
    ChunkLayout | None
        Always ``None`` for this default implementation.
    """
    # Default: no declared structure inside a chunk.
    opaque: ChunkLayout | None = None
    return opaque

async def _decode_single(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI:
raise NotImplementedError # pragma: no cover

Expand Down
12 changes: 12 additions & 0 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from collections.abc import Iterator
from typing import Self

from zarr.core.chunk_layouts import ChunkLayout
from zarr.core.common import JSON
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType

Expand Down Expand Up @@ -402,6 +403,17 @@ def to_dict(self) -> dict[str, JSON]:
},
}

def inner_chunk_layout(self) -> ChunkLayout:
    """Return the sub-chunk layout this sharding codec declares inside each shard.

    The returned layout's ``chunks`` is this codec's ``chunk_shape``. Its
    ``inner`` is populated only when the codec has exactly one inner codec;
    in that case that codec's own ``inner_chunk_layout()`` result is used,
    which is how nested sharding is represented. With any other number of
    inner codecs, ``inner`` is ``None``.
    """
    # Local import, as in the rest of this method's module-level TYPE_CHECKING
    # usage of ChunkLayout: the name is needed at runtime here.
    from zarr.core.chunk_layouts import ChunkLayout

    nested: ChunkLayout | None = None
    if len(self.codecs) == 1:
        nested = self.codecs[0].inner_chunk_layout()
    return ChunkLayout(chunks=tuple(self.chunk_shape), inner=nested)

def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
shard_spec = self._get_chunk_spec(array_spec)
evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs)
Expand Down
41 changes: 41 additions & 0 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@
from zarr.abc.codec import CodecPipeline
from zarr.abc.store import Store
from zarr.codecs.sharding import IndexLocation
from zarr.core.chunk_layouts import ChunkLayout
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar
from zarr.storage import StoreLike
from zarr.types import AnyArray, AnyAsyncArray, ArrayV2, ArrayV3, AsyncArrayV2, AsyncArrayV3
Expand Down Expand Up @@ -896,6 +897,30 @@ def shards(self) -> tuple[int, ...] | None:
"""
return self.metadata.shards

@property
def chunk_layout(self) -> ChunkLayout:
    """The declared chunk structure of the array.

    Built by ``ChunkLayout.from_metadata`` from this array's metadata: the
    chunk grid plus any sub-chunk structure reported by the codec pipeline,
    expressed in the form :func:`zarr.create_array` accepts. Unlike
    :attr:`chunks` / :attr:`shards`, this does not raise for rectilinear
    grids. See ``zarr.ChunkLayout``.

    Raises
    ------
    TypeError
        Propagated from ``ChunkLayout.from_metadata`` when the chunk grid
        kind is one it cannot distill.
    """
    # Imported at call time; the module-level import is type-checking only.
    from zarr.core import chunk_layouts

    return chunk_layouts.ChunkLayout.from_metadata(self.metadata)

@property
def is_sharded(self) -> bool:
    """Whether this array's chunks have internal sub-chunk structure (sharding).

    The answer comes from the codec pipeline, not from :attr:`chunk_layout`,
    so it is available even for chunk grid kinds that ``chunk_layout``
    cannot distill. Metadata without a ``codecs`` attribute, or with an
    empty codec sequence, is reported as not sharded. Only the first codec
    in the pipeline is consulted.
    """
    pipeline: tuple[Codec, ...] = getattr(self.metadata, "codecs", ())
    if not pipeline:
        return False
    # A non-None inner layout on the leading codec means chunks are subdivided.
    return pipeline[0].inner_chunk_layout() is not None

@property
def size(self) -> int:
"""Returns the total number of elements in the array
Expand Down Expand Up @@ -2051,6 +2076,22 @@ def shards(self) -> tuple[int, ...] | None:
"""
return self.async_array.shards

@property
def chunk_layout(self) -> ChunkLayout:
    """The declared chunk structure of the array.

    Delegates to the wrapped async array's ``chunk_layout``. The result
    describes the chunk grid metadata and sharding codecs in the form
    :func:`zarr.create_array` accepts. Unlike :attr:`chunks` /
    :attr:`shards`, this does not raise for rectilinear grids. See
    ``zarr.ChunkLayout``.
    """
    delegate = self.async_array
    return delegate.chunk_layout

@property
def is_sharded(self) -> bool:
    """Whether this array's chunks have internal sub-chunk structure (sharding).

    Delegates to the wrapped async array's ``is_sharded``.
    """
    delegate = self.async_array
    return delegate.is_sharded

@property
def size(self) -> int:
"""Returns the total number of elements in the array.
Expand Down
20 changes: 13 additions & 7 deletions src/zarr/core/chunk_grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,15 @@
"""


class ChunkLayout(NamedTuple):
class _ResolvedChunks(NamedTuple):
"""Result of resolving user `chunks`/`shards` into grid metadata inputs.

Internal, creation-time only. Not to be confused with the public
:class:`zarr.ChunkLayout` (``zarr.core.chunk_layouts``), which is the
canonical, extent-free distillation consumers introspect; this type
carries normalized per-dimension edge arrays for the metadata-construction
path. See ``design/chunk-layout.md`` § "Naming collision".

outer_chunks
Chunk sizes for the chunk grid metadata. When sharding is active
these are the shard sizes; otherwise they are the user's chunk sizes.
Expand All @@ -66,7 +72,7 @@ class ChunkLayout(NamedTuple):
"""

outer_chunks: ChunksTuple
inner: ChunkLayout | None = None
inner: _ResolvedChunks | None = None


@dataclass(frozen=True)
Expand Down Expand Up @@ -871,7 +877,7 @@ def resolve_outer_and_inner_chunks(
chunks: ChunksTuple,
shard_shape: ShardsLike | None,
item_size: int,
) -> ChunkLayout:
) -> _ResolvedChunks:
"""Resolve user `chunks`/`shards` into outer and inner chunk specs.

Parameters
Expand All @@ -890,18 +896,18 @@ def resolve_outer_and_inner_chunks(

Returns
-------
ChunkLayout
_ResolvedChunks
`outer_chunks` is the `ChunksTuple` for chunk grid
metadata. `inner` holds the sub-chunk structure for
`ShardingCodec`, or is `None` when sharding is not active.
"""
if shard_shape is None:
return ChunkLayout(outer_chunks=chunks)
return _ResolvedChunks(outer_chunks=chunks)

# Rectilinear shards: normalize the nested sequence directly.
if _is_rectilinear_chunks(shard_shape):
outer = normalize_chunks_nd(shard_shape, array_shape)
return ChunkLayout(outer_chunks=outer, inner=ChunkLayout(outer_chunks=chunks))
return _ResolvedChunks(outer_chunks=outer, inner=_ResolvedChunks(outer_chunks=chunks))

# Extract the flat chunk shape (first size per dimension) for arithmetic.
chunk_shape_flat = as_regular_shape(chunks)
Expand Down Expand Up @@ -937,4 +943,4 @@ def resolve_outer_and_inner_chunks(
shard_flat = cast("tuple[int, ...]", shard_shape)

outer = normalize_chunks_nd(shard_flat, array_shape)
return ChunkLayout(outer_chunks=outer, inner=ChunkLayout(outer_chunks=chunks))
return _ResolvedChunks(outer_chunks=outer, inner=_ResolvedChunks(outer_chunks=chunks))
112 changes: 112 additions & 0 deletions src/zarr/core/chunk_layouts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# DRAFT — public chunk-structure introspection (design/chunk-layout.md).
"""Public, typed chunk-structure introspection for zarr arrays.

``ChunkLayout`` distills the *declared* chunk structure of an array -- its chunk
grid metadata together with the chunk-structuring codecs in its pipeline -- into
the form a consumer can feed back into :func:`zarr.create_array`. See
``design/chunk-layout.md`` for the full rationale.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import ArrayV3Metadata


@dataclass(frozen=True, kw_only=True)
class ChunkLayout:
"""Declared chunk structure of an array.

A distillation of the chunk grid metadata and sharding codec
configuration. Extent-free: sizes are as declared, never clipped
to the array shape. Canonical: a dimension whose declared edge
lengths are all equal is normalized to the bare uniform size, so
layouts of the same abstract grid compare equal regardless of the
metadata that declared them.
"""

chunks: tuple[int | tuple[int, ...], ...]
"""Per-dimension chunk spec at this level: a bare int (uniform size)
or explicit edge lengths -- the same union ``create_array`` accepts."""

inner: ChunkLayout | None = None
"""Structure within each chunk of this level, or None if chunks are
opaque (no sharding)."""

def __post_init__(self) -> None:
# Reuse the metadata-layer validator (positive ints; non-empty edge
# tuples), which also normalizes inner sequences to tuples and is
# precisely typed as ``Sequence[int | Sequence[int]]``.
from zarr.core.metadata.v3 import _validate_chunk_shapes

validated = _validate_chunk_shapes(self.chunks)
# canonicalization: a uniform edge tuple collapses to its int
object.__setattr__(
self,
"chunks",
tuple(c[0] if isinstance(c, tuple) and len(set(c)) == 1 else c for c in validated),
)
if self.inner is not None and self.inner.ndim != self.ndim:
raise ValueError(f"inner layout has {self.inner.ndim} dimensions, expected {self.ndim}")

@property
def ndim(self) -> int:
return len(self.chunks)

@property
def is_regular(self) -> bool:
"""True if every dimension at this level has one uniform chunk size."""
return all(isinstance(c, int) for c in self.chunks)

@property
def is_sharded(self) -> bool:
"""True if chunks at this level have internal sub-chunk structure."""
return self.inner is not None

@property
def flattened_levels(self) -> tuple[ChunkLayout, ...]:
"""All nesting levels, outermost (storage granularity) to innermost."""
return (self, *(self.inner.flattened_levels if self.inner is not None else ()))

@property
def innermost(self) -> ChunkLayout:
"""The innermost level of declared subdivision.

Whether this unit is independently decodable depends on the
full codec pipeline, not on the declared structure alone.
"""
return self.inner.innermost if self.inner is not None else self

@classmethod
def from_metadata(cls, metadata: ArrayV2Metadata | ArrayV3Metadata) -> ChunkLayout:
"""Derive a :class:`ChunkLayout` from array metadata.

Raises ``TypeError`` for a chunk grid kind this version cannot distill.
"""
# Imported lazily to avoid an import cycle with the metadata modules.
from zarr.core.metadata.v2 import ArrayV2Metadata
from zarr.core.metadata.v3 import (
RectilinearChunkGridMetadata,
RegularChunkGridMetadata,
)

if isinstance(metadata, ArrayV2Metadata):
return cls(chunks=tuple(metadata.chunks))

inner: ChunkLayout | None = None
if metadata.codecs:
inner = metadata.codecs[0].inner_chunk_layout()

grid = metadata.chunk_grid
if isinstance(grid, RegularChunkGridMetadata):
return cls(chunks=tuple(grid.chunk_shape), inner=inner)
if isinstance(grid, RectilinearChunkGridMetadata):
return cls(
chunks=tuple(tuple(s) if not isinstance(s, int) else s for s in grid.chunk_shapes),
inner=inner,
)
raise TypeError(f"Cannot derive a ChunkLayout from chunk grid {type(grid).__name__}")
10 changes: 5 additions & 5 deletions tests/test_chunk_grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from tests.conftest import Expect, ExpectFail
from zarr.core.chunk_grids import (
ChunkLayout,
_guess_regular_chunks,
_ResolvedChunks,
normalize_chunks_1d,
normalize_chunks_nd,
resolve_outer_and_inner_chunks,
Expand Down Expand Up @@ -110,13 +110,13 @@ def test_resolve_outer_and_inner_chunks(


def test_chunk_layout_nested() -> None:
"""Test that ChunkLayout supports recursive nesting for nested sharding."""
"""Test that _ResolvedChunks supports recursive nesting for nested sharding."""
leaf = normalize_chunks_nd((5, 5), (100, 100))
mid = ChunkLayout(
mid = _ResolvedChunks(
outer_chunks=normalize_chunks_nd((25, 25), (100, 100)),
inner=ChunkLayout(outer_chunks=leaf),
inner=_ResolvedChunks(outer_chunks=leaf),
)
top = ChunkLayout(outer_chunks=normalize_chunks_nd((50, 50), (100, 100)), inner=mid)
top = _ResolvedChunks(outer_chunks=normalize_chunks_nd((50, 50), (100, 100)), inner=mid)

# Three levels: top -> mid -> leaf
_assert_chunks_equal(top.outer_chunks, ((50, 50), (50, 50)))
Expand Down
Loading
Loading