diff --git a/xrspatial/polygonize.py b/xrspatial/polygonize.py index fcdaa323..05b01f17 100644 --- a/xrspatial/polygonize.py +++ b/xrspatial/polygonize.py @@ -335,34 +335,36 @@ def _ranges_close(range_a, range_b, atol, rtol): def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8): """Direction-aware close-value test between two chunk-boundary polygons. - ``cells_a`` / ``cells_b`` map ``(global_col, global_row) -> value`` for - the pixels each polygon places on an internal chunk edge. Return True - if any pixel in ``cells_a`` is grid-adjacent to any pixel in ``cells_b`` - (orthogonally for 4-conn, plus diagonally for 8-conn) AND the pair + ``cells_a`` / ``cells_b`` map ``(global_col, global_row) -> + (value, w_match, s_match)`` for the pixels each polygon places on an + internal chunk edge. Return True if any pixel in ``cells_a`` is + orthogonally grid-adjacent to any pixel in ``cells_b`` AND the pair compares close under the *same orientation* numpy CCL uses. numpy's ``_calculate_regions`` walks pixels in raster-scan order and tests ``_is_close(reference=current, value=earlier_neighbour)`` where - ``current`` is the higher-``ij`` pixel (East / North / the SE/NE - diagonal partner). Because ``rtol`` scales by ``abs(reference)``, the - test is asymmetric, so the cross-chunk merge must pick the higher-``ij`` - pixel as the reference to match numpy byte for byte (#2666). - - ``ij = col + row * nx`` increases with row first, then column, so the - higher-``ij`` pixel of an adjacent pair is the one with the greater row, - or the greater column when rows are equal. + ``current`` is the higher-``ij`` pixel (East / North). Because + ``rtol`` scales by ``abs(reference)``, the test is asymmetric, so the + cross-chunk merge must pick the higher-``ij`` pixel as the reference + to match numpy byte for byte (#2666). + + This function handles the 4-connectivity case only. 8-connectivity + diagonals are conditional in numpy CCL (the SW / SE neighbour is + consulted only when the orthogonal W / S neighbour did not match), so + they cannot be reproduced by an independent pairwise test; the 8-conn + float merge instead replays numpy's full predicate over the union of + boundary cells in :func:`_boundary_ccl_unions` (#2677). """ if not cells_a or not cells_b: return False - for (ca, ra), va in cells_a.items(): - for (cb, rb), vb in cells_b.items(): + for (ca, ra), (va, _wa, _sa) in cells_a.items(): + for (cb, rb), (vb, _wb, _sb) in cells_b.items(): dcol = cb - ca drow = rb - ra - adjacent = ( + orthogonal = ( (abs(dcol) == 1 and drow == 0) or - (abs(drow) == 1 and dcol == 0) or - (connectivity_8 and abs(dcol) == 1 and abs(drow) == 1)) - if not adjacent: + (abs(drow) == 1 and dcol == 0)) + if not orthogonal: continue # Higher-ij pixel is the reference (greater row, then col). if (rb, cb) > (ra, ca): @@ -374,6 +376,88 @@ def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8): return False +def _boundary_ccl_unions(boundary_polys, atol, rtol, union): + """Union 8-conn float boundary polygons via numpy's exact CCL predicate. + + A pairwise close test cannot reproduce numpy's *conditional* diagonal: + in ``_calculate_regions`` a pixel consults its SW diagonal only when + its W neighbour did not match, and its SE diagonal only when its S + neighbour did not match. Honouring every diagonal unconditionally + over-merged regions numpy keeps separate (#2677). + + This replays that predicate over the union of every boundary polygon's + edge cells. Each cell carries ``(value, w_match, s_match)`` where the + flags are numpy's direct W / S close test against the *in-chunk* + neighbour (see :func:`_compute_region_value_ranges`). Walking the + cells in raster-scan order, the orthogonal-match decision prefers the + actual neighbour value when that neighbour is itself a boundary cell + (cross-chunk case) and falls back to the recorded flag otherwise + (the neighbour is interior to the cell's own chunk). Whenever numpy's + predicate fires, the polygons owning the two cells are unioned, so the + transitive closure matches numpy's single-chunk labelling. + """ + # Global (col, row) -> value and -> set(owner polygon index) maps. + # A coordinate is one physical pixel, so every owner sharing it + # carries the same value and the same W / S match flags; overwriting + # ``field`` and reading the flags from any one owner is therefore safe. + field = {} + owners = {} + for idx, item in enumerate(boundary_polys): + cells = item[3] + if not cells: + continue + for coord, (v, _w, _s) in cells.items(): + field[coord] = v + owners.setdefault(coord, set()).add(idx) + + def link(coord_a, coord_b): + for oi in owners.get(coord_a, ()): + for oj in owners.get(coord_b, ()): + if oi != oj: + union(oi, oj) + + # Raster-scan order: row first, then column, matching numpy's ij walk. + for (c, r) in sorted(field, key=lambda cr: (cr[1], cr[0])): + v = field[(c, r)] + flags = None # (w_match, s_match) from any owner cell. + for oi in owners[(c, r)]: + cell = boundary_polys[oi][3].get((c, r)) + if cell is not None: + flags = (cell[1], cell[2]) + break + w_flag, s_flag = flags if flags is not None else (False, False) + + # West neighbour. + w_val = field.get((c - 1, r)) + if w_val is not None: + matches_w = _values_close(v, w_val, atol, rtol) + if matches_w: + link((c, r), (c - 1, r)) + else: + # Interior W neighbour: numpy's match recorded as a flag. + matches_w = w_flag + + # South neighbour. + s_val = field.get((c, r - 1)) + if s_val is not None: + matches_s = _values_close(v, s_val, atol, rtol) + if matches_s: + link((c, r), (c, r - 1)) + else: + matches_s = s_flag + + # Conditional diagonals: SW only if W did not match, SE only if S + # did not match -- numpy's exact 8-connectivity shortcut. + if not matches_w: + sw_val = field.get((c - 1, r - 1)) + if sw_val is not None and _values_close(v, sw_val, atol, rtol): + link((c, r), (c - 1, r - 1)) + if not matches_s: + se_val = field.get((c + 1, r - 1)) + if se_val is not None and _values_close(v, se_val, atol, rtol): + link((c, r), (c + 1, r - 1)) + + def _group_boundary_polygons(boundary_polys, atol: float = _DEFAULT_ATOL, rtol: float = _DEFAULT_RTOL, @@ -503,6 +587,16 @@ def union(i, j): if ri != rj: parent[max(ri, rj)] = min(ri, rj) + # 8-connectivity float merge: numpy's diagonal is *conditional* on the + # orthogonal neighbour, so a per-pair close test over-merges (#2677). + # Replicate numpy's exact CCL predicate over the union of boundary + # cells -- including the conditional SW / SE diagonal -- and union the + # owning polygons. 4-connectivity and integer rasters keep the + # ring-edge-keyed pairwise test below unchanged. + have_cells = any(item[3] for item in boundary_polys) + if connectivity_8 and have_cells: + _boundary_ccl_unions(boundary_polys, atol, rtol, union) + for poly_indices in adjacency_index.values(): if len(poly_indices) < 2: continue @@ -519,9 +613,14 @@ def union(i, j): # scan-order ``_is_close`` orientation exactly (#2666). # Integer polygons have no cell maps and fall back to the # range check, which reduces to strict equality for them. + # 8-conn float unions are decided by the CCL pass above, + # so skip the per-pair value test for that case (its + # conditional diagonal cannot be reproduced pairwise). cells_i = boundary_polys[pi][3] cells_j = boundary_polys[pj][3] if cells_i and cells_j: + if connectivity_8: + continue close = _cells_close_directed( cells_i, cells_j, atol, rtol, connectivity_8) else: @@ -1205,12 +1304,13 @@ def _polygonize_chunk(block, mask_block, connectivity_8, row_offset, col_offset, ``(value, rings, (val_min, val_max), cells)`` tuple instead of just ``(value, rings)``. The ``(val_min, val_max)`` range supports the #2583 tolerance-chain union; ``cells`` maps - ``(global_col, global_row) -> value`` for the polygon's pixels on - internal chunk edges and lets the cross-chunk merge apply numpy - CCL's direction-aware ``_is_close`` orientation at the exact pixel - pair straddling each boundary (#2666). Integer rasters use strict - equality, so their ``cells`` map is empty and the merge falls back - to the range check. + ``(global_col, global_row) -> (value, w_match, s_match)`` for the + polygon's pixels on internal chunk edges and lets the cross-chunk + merge apply numpy CCL's direction-aware ``_is_close`` orientation at + the exact pixel pair straddling each boundary (#2666), with the + ``w_match`` / ``s_match`` flags guarding the conditional 8-conn + diagonal (#2677). Integer rasters use strict equality, so their + ``cells`` map is empty and the merge falls back to the range check. """ block = _to_numpy(block) if mask_block is not None: @@ -1287,12 +1387,16 @@ def _compute_region_value_ranges(block, mask_block, connectivity_8, * ``val_ranges[i]`` is the ``(val_min, val_max)`` span of region ``i`` (used by #2583 to carry the value extent into the cross-chunk merge). - * ``boundary_cells[i]`` is a ``{(global_col, global_row): value}`` dict - of the region's pixels that lie on an *internal* chunk edge (an edge - shared with a neighbour chunk, not a raster edge). The cross-chunk - merge (#2666) uses these per-cell values to replicate numpy CCL's + * ``boundary_cells[i]`` is a + ``{(global_col, global_row): (value, w_match, s_match)}`` dict of the + region's pixels that lie on an *internal* chunk edge (an edge shared + with a neighbour chunk, not a raster edge). The cross-chunk merge + (#2666) uses these per-cell values to replicate numpy CCL's direction-aware ``_is_close`` orientation at the actual pixel pair straddling each chunk boundary, instead of a symmetric range check. + ``w_match`` / ``s_match`` flag whether numpy CCL connected the pixel + to its in-chunk W / S orthogonal neighbour, which the 8-conn diagonal + guard (#2677) needs to know to suppress over-merging. ``row_offset`` / ``col_offset`` / ``ny_total`` / ``nx_total`` describe where this chunk sits in the global raster so internal edges can be @@ -1377,7 +1481,27 @@ def _compute_region_value_ranges(block, mask_block, connectivity_8, if cells is None: cells = {} boundary_cells[idx] = cells - cells[(local_col + col_offset, local_row + row_offset)] = v + # Record whether numpy CCL's per-pixel close test matched + # this pixel against its in-chunk W / S orthogonal + # neighbour. This mirrors the ``matches_W`` / ``matches_S`` + # predicates in :func:`_calculate_regions` exactly -- it is + # the direct ``_is_close(cur, neighbour)`` test, NOT region + # co-membership (two pixels can share a region via a third + # path without being directly close). The 8-conn diagonal + # guard (#2677) consults the diagonal only when this + # orthogonal test failed, matching numpy's + # ``if not matches_W`` / ``if not matches_S`` shortcut. W + # is (local_row, local_col-1); S is (local_row-1, local_col). + w_match = ( + local_col > 0 and + (mask_flat is None or mask_flat[ij - 1]) and + _is_close(v, values_flat[ij - 1], atol, rtol)) + s_match = ( + local_row > 0 and + (mask_flat is None or mask_flat[ij - nx_eff]) and + _is_close(v, values_flat[ij - nx_eff], atol, rtol)) + cells[(local_col + col_offset, local_row + row_offset)] = ( + v, bool(w_match), bool(s_match)) out = [] cells_out = [] for i in range(n_regions): diff --git a/xrspatial/tests/test_polygonize_issue_2677.py b/xrspatial/tests/test_polygonize_issue_2677.py new file mode 100644 index 00000000..4b706bf3 --- /dev/null +++ b/xrspatial/tests/test_polygonize_issue_2677.py @@ -0,0 +1,199 @@ +"""Dask 8-connectivity float polygonize parity at large rtol (#2677). + +numpy connected-component labelling consults an 8-connected diagonal +neighbour *conditionally*: a pixel only looks at its SW diagonal when its +W neighbour did not match, and at its SE diagonal only when its S +neighbour did not match (see the ``connectivity_8`` block in +``_calculate_regions``). The dask cross-chunk merge in +``_cells_close_directed`` previously honoured every diagonal pair +unconditionally, so at a large ``rtol`` it could union two regions that +numpy keeps separate when the intervening orthogonal neighbour already +matched. The result was a chunked 8-conn float raster reporting fewer +polygons (and different DN values) than the unchunked input. + +This is distinct from the #2666 rtol scan-order orientation: it fails +across chunkings that do not split the diverging pixels, so it is the +diagonal-pairing decision, not the cross-chunk boundary close test, that +was wrong. The fix carries per-cell ``w_match`` / ``s_match`` flags plus +a global boundary-value map so the merge consults a diagonal only when +numpy would, for both in-chunk and cross-chunk orthogonal neighbours. + +These tests pin numpy/dask parity for the issue repro across every +chunking, and add a deterministic fuzz over random float rasters so the +8-conn merge stays chunk-invariant. +""" +import itertools + +import numpy as np +import pytest +import xarray as xr + +try: + import cupy +except ImportError: + cupy = None + +try: + import dask.array as da +except ImportError: + da = None + +from ..polygonize import polygonize +from .general_checks import cuda_and_cupy_available, dask_array_available + + +def _to_dask(arr, chunks): + return xr.DataArray(da.from_array(arr, chunks=chunks)) + + +def _to_dask_cupy(arr, chunks): + return xr.DataArray(da.from_array(cupy.asarray(arr), chunks=chunks)) + + +def _signature(values, geoms): + """Order-independent polygon signature: DN value plus exterior-ring + min corner and vertex count. Stronger than a bare count + DN multiset + because it also pins where each polygon sits and how big it is. + """ + sig = [] + for v, rings in zip(values, geoms): + ext = rings[0] + sig.append(( + round(float(v), 9), + round(float(ext[:, 0].min()), 6), + round(float(ext[:, 1].min()), 6), + len(ext), + )) + return sorted(sig) + + +def _all_chunkings(shape): + ny, nx = shape + return sorted(set(itertools.product(range(1, ny + 1), range(1, nx + 1)))) + + +# The exact repro from the issue: numpy reports 6 polygons (8-conn, +# atol=0, rtol=0.1); the unpatched dask merge reported fewer because it +# unioned the value-2.206 region into the value-2.098 region through a +# diagonal numpy never consults. +_REPRO = np.array( + [ + [2.098, 2.43, 2.206, 2.09], + [1.847, 2.292, 1.875, 2.784], + [2.927, 1.767, 2.583, 2.058], + [2.136, 2.851, 1.142, 1.174], + ], + dtype=np.float64, +) + + +@dask_array_available +class TestReproChunkInvariant: + """The issue repro must match the unchunked numpy partition for every + chunking, not just the count. + """ + + def test_numpy_reference_polygon_count(self): + v_np, _ = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + assert len(v_np) == 6 + + @pytest.mark.parametrize("chunks", _all_chunkings(_REPRO.shape)) + def test_dask_matches_numpy(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + v_dk, g_dk = polygonize( + _to_dask(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=8) + assert len(v_dk) == len(v_np), ( + f"count mismatch chunks={chunks}: " + f"numpy={len(v_np)} dask={len(v_dk)}" + ) + assert _signature(v_dk, g_dk) == _signature(v_np, g_np) + + +@dask_array_available +class TestEightConnFuzzParity: + """Random float rasters: every chunking of an 8-conn polygonize must + reproduce the unchunked numpy polygon partition. Deterministic via a + fixed seed -- no wall-clock assertions. + """ + + # Total numpy polygon count over the fixed-seed sweep, recorded per + # rtol. Pinning it means a numpy-side CCL change is caught here + # directly, rather than silently shifting the dask parity reference. + _EXPECTED_NUMPY_TOTAL = {0.0: 639, 0.05: 492, 0.1: 352, 0.2: 200} + + @pytest.mark.parametrize("rtol", [0.0, 0.05, 0.1, 0.2]) + def test_parity_8conn(self, rtol): + rng = np.random.default_rng(2677) + numpy_total = 0 + for _ in range(40): + ny = int(rng.integers(3, 6)) + nx = int(rng.integers(3, 6)) + arr = np.round(rng.uniform(1.0, 3.0, size=(ny, nx)), 3) + v_np, g_np = polygonize( + xr.DataArray(arr), atol=0.0, rtol=rtol, connectivity=8) + numpy_total += len(v_np) + ref = _signature(v_np, g_np) + for chunks in _all_chunkings((ny, nx)): + v_dk, g_dk = polygonize( + _to_dask(arr, chunks=chunks), + atol=0.0, rtol=rtol, connectivity=8) + assert _signature(v_dk, g_dk) == ref, ( + f"diverge arr={arr.tolist()} rtol={rtol} " + f"chunks={chunks}" + ) + expected = self._EXPECTED_NUMPY_TOTAL[rtol] + assert expected is None or numpy_total == expected, ( + f"numpy 8-conn reference count changed for rtol={rtol}: " + f"got {numpy_total}, expected {expected}" + ) + + +@dask_array_available +class TestFourConnUnaffected: + """4-connectivity already matched numpy after #2675; the diagonal + guard must leave it untouched. + """ + + @pytest.mark.parametrize("chunks", _all_chunkings(_REPRO.shape)) + def test_4conn_repro_unchanged(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=4) + v_dk, g_dk = polygonize( + _to_dask(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=4) + assert _signature(v_dk, g_dk) == _signature(v_np, g_np) + + +@dask_array_available +class TestIntegerEightConnUnaffected: + """Integer 8-conn rasters use strict equality; the float diagonal + guard must not change their chunk-invariant result. + """ + + def test_integer_chunk_invariance(self): + arr = np.array([[1, 2, 1], [2, 1, 2], [1, 2, 1]], dtype=np.int32) + v_np, _ = polygonize(xr.DataArray(arr), connectivity=8) + for chunks in _all_chunkings(arr.shape): + v_dk, _ = polygonize( + _to_dask(arr, chunks=chunks), connectivity=8) + assert sorted(v_dk) == sorted(v_np) + + +@cuda_and_cupy_available +@dask_array_available +class TestReproDaskCuPy: + """dask+cupy shares the cross-chunk merge path, so the repro must + match numpy there too. + """ + + @pytest.mark.parametrize("chunks", [(2, 2), (1, 4), (4, 1)]) + def test_repro_dask_cupy(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + v_dc, g_dc = polygonize( + _to_dask_cupy(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=8) + assert _signature(v_dc, g_dc) == _signature(v_np, g_np)