From 5b745535caa8e8ab6f24167a72a2d0c2a01facc0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 8 Jun 2026 04:00:55 -0700 Subject: [PATCH 1/3] polygonize: guard conditional 8-conn diagonal in dask cross-chunk merge (#2677) --- xrspatial/polygonize.py | 139 +++++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 24 deletions(-) diff --git a/xrspatial/polygonize.py b/xrspatial/polygonize.py index fcdaa3233..f5edbbe86 100644 --- a/xrspatial/polygonize.py +++ b/xrspatial/polygonize.py @@ -332,14 +332,18 @@ def _ranges_close(range_a, range_b, atol, rtol): return False -def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8): +def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8, + cell_field=None): """Direction-aware close-value test between two chunk-boundary polygons. - ``cells_a`` / ``cells_b`` map ``(global_col, global_row) -> value`` for - the pixels each polygon places on an internal chunk edge. Return True - if any pixel in ``cells_a`` is grid-adjacent to any pixel in ``cells_b`` - (orthogonally for 4-conn, plus diagonally for 8-conn) AND the pair - compares close under the *same orientation* numpy CCL uses. + ``cells_a`` / ``cells_b`` map ``(global_col, global_row) -> + (value, w_match, s_match)`` for the pixels each polygon places on an + internal chunk edge. ``w_match`` / ``s_match`` record whether numpy + CCL connected that pixel to its in-chunk W / S orthogonal neighbour + (same region). Return True if any pixel in ``cells_a`` is grid-adjacent + to any pixel in ``cells_b`` (orthogonally for 4-conn, plus diagonally + for 8-conn) AND the pair compares close under the *same orientation* + numpy CCL uses. numpy's ``_calculate_regions`` walks pixels in raster-scan order and tests ``_is_close(reference=current, value=earlier_neighbour)`` where @@ -351,29 +355,86 @@ def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8): ``ij = col + row * nx`` increases with row first, then column, so the higher-``ij`` pixel of an adjacent pair is the one with the greater row, or the greater column when rows are equal. + + 8-connectivity diagonals are *conditional* in numpy CCL: a pixel only + consults its SW diagonal when its W neighbour did not match, and its + SE diagonal only when its S neighbour did not match (see the + ``connectivity_8`` block in :func:`_calculate_regions`). A pairwise + cross-chunk test that always honoured the diagonal over-merged two + regions numpy keeps separate when the intervening orthogonal neighbour + already matched (#2677). The guard against that has two parts: + + * ``w_match`` / ``s_match`` on the higher-``ij`` pixel cover the case + where the orthogonal neighbour is *in the same chunk* (so numpy + already absorbed it into this pixel's region during the per-chunk + pass). + * ``cell_field`` -- a global ``(col, row) -> value`` map over every + boundary pixel -- covers the case where the orthogonal neighbour + lives in a *different chunk* and is itself a boundary pixel; the + diagonal is suppressed when that neighbour value is close to the + higher-``ij`` pixel under numpy's orientation. """ if not cells_a or not cells_b: return False - for (ca, ra), va in cells_a.items(): - for (cb, rb), vb in cells_b.items(): + for (ca, ra), (va, wa, sa) in cells_a.items(): + for (cb, rb), (vb, wb, sb) in cells_b.items(): dcol = cb - ca drow = rb - ra - adjacent = ( + orthogonal = ( (abs(dcol) == 1 and drow == 0) or - (abs(drow) == 1 and dcol == 0) or - (connectivity_8 and abs(dcol) == 1 and abs(drow) == 1)) - if not adjacent: + (abs(drow) == 1 and dcol == 0)) + diagonal = ( + connectivity_8 and abs(dcol) == 1 and abs(drow) == 1) + if not (orthogonal or diagonal): continue # Higher-ij pixel is the reference (greater row, then col). if (rb, cb) > (ra, ca): reference, value = vb, va + hi_col, hi_row = cb, rb + w_match, s_match = wb, sb + lo_col = ca else: reference, value = va, vb + hi_col, hi_row = ca, ra + w_match, s_match = wa, sa + lo_col = cb + if diagonal: + # numpy consults the SW diagonal only when W did not + # match, and the SE diagonal only when S did not match. + # The diagonal partner sits one row below the higher-ij + # pixel, so lo_col < hi_col is SW, lo_col > hi_col is SE. + if lo_col == hi_col - 1: + if w_match or _guard_matches( + cell_field, hi_col - 1, hi_row, + reference, atol, rtol): + continue + elif lo_col == hi_col + 1: + if s_match or _guard_matches( + cell_field, hi_col, hi_row - 1, + reference, atol, rtol): + continue if _values_close(reference, value, atol, rtol): return True return False +def _guard_matches(cell_field, gcol, grow, reference, atol, rtol): + """Whether the orthogonal guard pixel at ``(gcol, grow)`` matched. + + Looks the guard pixel up in the global boundary-value map and tests it + against the higher-``ij`` pixel under numpy's ``_is_close`` orientation + (reference scales the tolerance). Returns False when the guard is not + a boundary pixel -- the in-chunk ``w_match`` / ``s_match`` flags cover + that case, so a missing entry here must not suppress a real merge. + """ + if cell_field is None: + return False + guard = cell_field.get((gcol, grow)) + if guard is None: + return False + return _values_close(reference, guard, atol, rtol) + + def _group_boundary_polygons(boundary_polys, atol: float = _DEFAULT_ATOL, rtol: float = _DEFAULT_RTOL, @@ -489,6 +550,19 @@ def _group_boundary_polygons(boundary_polys, seen.add(key) adjacency_index.setdefault(key, []).append(idx) + # Global (col, row) -> value map across every boundary polygon's + # cells. The 8-connectivity diagonal guard in + # ``_cells_close_directed`` consults the orthogonal neighbour numpy + # would have checked first when that neighbour lives in another chunk + # (#2677). Cell values are (value, w_match, s_match) tuples. + cell_field = {} + if connectivity_8: + for item in boundary_polys: + cells = item[3] + if cells: + for coord, cval in cells.items(): + cell_field[coord] = cval[0] + # Union-find over polygon indices. parent = list(range(n)) @@ -523,7 +597,8 @@ def union(i, j): cells_j = boundary_polys[pj][3] if cells_i and cells_j: close = _cells_close_directed( - cells_i, cells_j, atol, rtol, connectivity_8) + cells_i, cells_j, atol, rtol, connectivity_8, + cell_field=cell_field) else: range_i = boundary_polys[pi][2] range_j = boundary_polys[pj][2] @@ -1205,12 +1280,13 @@ def _polygonize_chunk(block, mask_block, connectivity_8, row_offset, col_offset, ``(value, rings, (val_min, val_max), cells)`` tuple instead of just ``(value, rings)``. The ``(val_min, val_max)`` range supports the #2583 tolerance-chain union; ``cells`` maps - ``(global_col, global_row) -> value`` for the polygon's pixels on - internal chunk edges and lets the cross-chunk merge apply numpy - CCL's direction-aware ``_is_close`` orientation at the exact pixel - pair straddling each boundary (#2666). Integer rasters use strict - equality, so their ``cells`` map is empty and the merge falls back - to the range check. + ``(global_col, global_row) -> (value, w_match, s_match)`` for the + polygon's pixels on internal chunk edges and lets the cross-chunk + merge apply numpy CCL's direction-aware ``_is_close`` orientation at + the exact pixel pair straddling each boundary (#2666), with the + ``w_match`` / ``s_match`` flags guarding the conditional 8-conn + diagonal (#2677). Integer rasters use strict equality, so their + ``cells`` map is empty and the merge falls back to the range check. """ block = _to_numpy(block) if mask_block is not None: @@ -1287,12 +1363,16 @@ def _compute_region_value_ranges(block, mask_block, connectivity_8, * ``val_ranges[i]`` is the ``(val_min, val_max)`` span of region ``i`` (used by #2583 to carry the value extent into the cross-chunk merge). - * ``boundary_cells[i]`` is a ``{(global_col, global_row): value}`` dict - of the region's pixels that lie on an *internal* chunk edge (an edge - shared with a neighbour chunk, not a raster edge). The cross-chunk - merge (#2666) uses these per-cell values to replicate numpy CCL's + * ``boundary_cells[i]`` is a + ``{(global_col, global_row): (value, w_match, s_match)}`` dict of the + region's pixels that lie on an *internal* chunk edge (an edge shared + with a neighbour chunk, not a raster edge). The cross-chunk merge + (#2666) uses these per-cell values to replicate numpy CCL's direction-aware ``_is_close`` orientation at the actual pixel pair straddling each chunk boundary, instead of a symmetric range check. + ``w_match`` / ``s_match`` flag whether numpy CCL connected the pixel + to its in-chunk W / S orthogonal neighbour, which the 8-conn diagonal + guard (#2677) needs to know to suppress over-merging. ``row_offset`` / ``col_offset`` / ``ny_total`` / ``nx_total`` describe where this chunk sits in the global raster so internal edges can be @@ -1377,7 +1457,18 @@ def _compute_region_value_ranges(block, mask_block, connectivity_8, if cells is None: cells = {} boundary_cells[idx] = cells - cells[(local_col + col_offset, local_row + row_offset)] = v + # Record whether numpy CCL connected this pixel to its + # in-chunk W / S orthogonal neighbour (same region). The + # 8-conn diagonal guard (#2677) suppresses a cross-chunk + # diagonal merge when the orthogonal neighbour numpy + # checks first already matched. W is (local_row, + # local_col-1); S is (local_row-1, local_col). + w_match = ( + local_col > 0 and regions[ij - 1] == r) + s_match = ( + local_row > 0 and regions[ij - nx_eff] == r) + cells[(local_col + col_offset, local_row + row_offset)] = ( + v, w_match, s_match) out = [] cells_out = [] for i in range(n_regions): From f81472a81a99e75b3aaf04e52efd2cfb716cdc8a Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 8 Jun 2026 04:12:27 -0700 Subject: [PATCH 2/3] polygonize: replay numpy CCL conditional 8-conn diagonal in dask merge (#2677) Replace the per-pair 8-conn diagonal close test, which honoured every diagonal unconditionally and over-merged regions numpy keeps separate, with a pass that replays numpy's exact CCL predicate over the union of boundary cells. The conditional SW/SE diagonal uses the real orthogonal neighbour value when it is a boundary cell and a recorded in-chunk match flag otherwise, so a chunked 8-conn float raster reproduces the unchunked polygon partition. Add deterministic parity tests. --- xrspatial/polygonize.py | 226 ++++++++++-------- xrspatial/tests/test_polygonize_issue_2677.py | 187 +++++++++++++++ 2 files changed, 315 insertions(+), 98 deletions(-) create mode 100644 xrspatial/tests/test_polygonize_issue_2677.py diff --git a/xrspatial/polygonize.py b/xrspatial/polygonize.py index f5edbbe86..78c8c0ba9 100644 --- a/xrspatial/polygonize.py +++ b/xrspatial/polygonize.py @@ -332,107 +332,127 @@ def _ranges_close(range_a, range_b, atol, rtol): return False -def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8, - cell_field=None): +def _cells_close_directed(cells_a, cells_b, atol, rtol, connectivity_8): """Direction-aware close-value test between two chunk-boundary polygons. ``cells_a`` / ``cells_b`` map ``(global_col, global_row) -> (value, w_match, s_match)`` for the pixels each polygon places on an - internal chunk edge. ``w_match`` / ``s_match`` record whether numpy - CCL connected that pixel to its in-chunk W / S orthogonal neighbour - (same region). Return True if any pixel in ``cells_a`` is grid-adjacent - to any pixel in ``cells_b`` (orthogonally for 4-conn, plus diagonally - for 8-conn) AND the pair compares close under the *same orientation* - numpy CCL uses. + internal chunk edge. Return True if any pixel in ``cells_a`` is + orthogonally grid-adjacent to any pixel in ``cells_b`` AND the pair + compares close under the *same orientation* numpy CCL uses. numpy's ``_calculate_regions`` walks pixels in raster-scan order and tests ``_is_close(reference=current, value=earlier_neighbour)`` where - ``current`` is the higher-``ij`` pixel (East / North / the SE/NE - diagonal partner). Because ``rtol`` scales by ``abs(reference)``, the - test is asymmetric, so the cross-chunk merge must pick the higher-``ij`` - pixel as the reference to match numpy byte for byte (#2666). - - ``ij = col + row * nx`` increases with row first, then column, so the - higher-``ij`` pixel of an adjacent pair is the one with the greater row, - or the greater column when rows are equal. - - 8-connectivity diagonals are *conditional* in numpy CCL: a pixel only - consults its SW diagonal when its W neighbour did not match, and its - SE diagonal only when its S neighbour did not match (see the - ``connectivity_8`` block in :func:`_calculate_regions`). A pairwise - cross-chunk test that always honoured the diagonal over-merged two - regions numpy keeps separate when the intervening orthogonal neighbour - already matched (#2677). The guard against that has two parts: - - * ``w_match`` / ``s_match`` on the higher-``ij`` pixel cover the case - where the orthogonal neighbour is *in the same chunk* (so numpy - already absorbed it into this pixel's region during the per-chunk - pass). - * ``cell_field`` -- a global ``(col, row) -> value`` map over every - boundary pixel -- covers the case where the orthogonal neighbour - lives in a *different chunk* and is itself a boundary pixel; the - diagonal is suppressed when that neighbour value is close to the - higher-``ij`` pixel under numpy's orientation. + ``current`` is the higher-``ij`` pixel (East / North). Because + ``rtol`` scales by ``abs(reference)``, the test is asymmetric, so the + cross-chunk merge must pick the higher-``ij`` pixel as the reference + to match numpy byte for byte (#2666). + + This function handles the 4-connectivity case only. 8-connectivity + diagonals are conditional in numpy CCL (the SW / SE neighbour is + consulted only when the orthogonal W / S neighbour did not match), so + they cannot be reproduced by an independent pairwise test; the 8-conn + float merge instead replays numpy's full predicate over the union of + boundary cells in :func:`_boundary_ccl_unions` (#2677). """ if not cells_a or not cells_b: return False - for (ca, ra), (va, wa, sa) in cells_a.items(): - for (cb, rb), (vb, wb, sb) in cells_b.items(): + for (ca, ra), (va, _wa, _sa) in cells_a.items(): + for (cb, rb), (vb, _wb, _sb) in cells_b.items(): dcol = cb - ca drow = rb - ra orthogonal = ( (abs(dcol) == 1 and drow == 0) or (abs(drow) == 1 and dcol == 0)) - diagonal = ( - connectivity_8 and abs(dcol) == 1 and abs(drow) == 1) - if not (orthogonal or diagonal): + if not orthogonal: continue # Higher-ij pixel is the reference (greater row, then col). if (rb, cb) > (ra, ca): reference, value = vb, va - hi_col, hi_row = cb, rb - w_match, s_match = wb, sb - lo_col = ca else: reference, value = va, vb - hi_col, hi_row = ca, ra - w_match, s_match = wa, sa - lo_col = cb - if diagonal: - # numpy consults the SW diagonal only when W did not - # match, and the SE diagonal only when S did not match. - # The diagonal partner sits one row below the higher-ij - # pixel, so lo_col < hi_col is SW, lo_col > hi_col is SE. - if lo_col == hi_col - 1: - if w_match or _guard_matches( - cell_field, hi_col - 1, hi_row, - reference, atol, rtol): - continue - elif lo_col == hi_col + 1: - if s_match or _guard_matches( - cell_field, hi_col, hi_row - 1, - reference, atol, rtol): - continue if _values_close(reference, value, atol, rtol): return True return False -def _guard_matches(cell_field, gcol, grow, reference, atol, rtol): - """Whether the orthogonal guard pixel at ``(gcol, grow)`` matched. - - Looks the guard pixel up in the global boundary-value map and tests it - against the higher-``ij`` pixel under numpy's ``_is_close`` orientation - (reference scales the tolerance). Returns False when the guard is not - a boundary pixel -- the in-chunk ``w_match`` / ``s_match`` flags cover - that case, so a missing entry here must not suppress a real merge. +def _boundary_ccl_unions(boundary_polys, atol, rtol, union): + """Union 8-conn float boundary polygons via numpy's exact CCL predicate. + + A pairwise close test cannot reproduce numpy's *conditional* diagonal: + in ``_calculate_regions`` a pixel consults its SW diagonal only when + its W neighbour did not match, and its SE diagonal only when its S + neighbour did not match. Honouring every diagonal unconditionally + over-merged regions numpy keeps separate (#2677). + + This replays that predicate over the union of every boundary polygon's + edge cells. Each cell carries ``(value, w_match, s_match)`` where the + flags are numpy's direct W / S close test against the *in-chunk* + neighbour (see :func:`_compute_region_value_ranges`). Walking the + cells in raster-scan order, the orthogonal-match decision prefers the + actual neighbour value when that neighbour is itself a boundary cell + (cross-chunk case) and falls back to the recorded flag otherwise + (the neighbour is interior to the cell's own chunk). Whenever numpy's + predicate fires, the polygons owning the two cells are unioned, so the + transitive closure matches numpy's single-chunk labelling. """ - if cell_field is None: - return False - guard = cell_field.get((gcol, grow)) - if guard is None: - return False - return _values_close(reference, guard, atol, rtol) + # Global (col, row) -> value and -> set(owner polygon index) maps. + field = {} + owners = {} + for idx, item in enumerate(boundary_polys): + cells = item[3] + if not cells: + continue + for coord, (v, _w, _s) in cells.items(): + field[coord] = v + owners.setdefault(coord, set()).add(idx) + + def link(coord_a, coord_b): + for oi in owners.get(coord_a, ()): + for oj in owners.get(coord_b, ()): + if oi != oj: + union(oi, oj) + + # Raster-scan order: row first, then column, matching numpy's ij walk. + for (c, r) in sorted(field, key=lambda cr: (cr[1], cr[0])): + v = field[(c, r)] + flags = None # (w_match, s_match) from any owner cell. + for oi in owners[(c, r)]: + cell = boundary_polys[oi][3].get((c, r)) + if cell is not None: + flags = (cell[1], cell[2]) + break + w_flag, s_flag = flags if flags is not None else (False, False) + + # West neighbour. + w_val = field.get((c - 1, r)) + if w_val is not None: + matches_w = _values_close(v, w_val, atol, rtol) + if matches_w: + link((c, r), (c - 1, r)) + else: + # Interior W neighbour: numpy's match recorded as a flag. + matches_w = w_flag + + # South neighbour. + s_val = field.get((c, r - 1)) + if s_val is not None: + matches_s = _values_close(v, s_val, atol, rtol) + if matches_s: + link((c, r), (c, r - 1)) + else: + matches_s = s_flag + + # Conditional diagonals: SW only if W did not match, SE only if S + # did not match -- numpy's exact 8-connectivity shortcut. + if not matches_w: + sw_val = field.get((c - 1, r - 1)) + if sw_val is not None and _values_close(v, sw_val, atol, rtol): + link((c, r), (c - 1, r - 1)) + if not matches_s: + se_val = field.get((c + 1, r - 1)) + if se_val is not None and _values_close(v, se_val, atol, rtol): + link((c, r), (c + 1, r - 1)) def _group_boundary_polygons(boundary_polys, @@ -550,19 +570,6 @@ def _group_boundary_polygons(boundary_polys, seen.add(key) adjacency_index.setdefault(key, []).append(idx) - # Global (col, row) -> value map across every boundary polygon's - # cells. The 8-connectivity diagonal guard in - # ``_cells_close_directed`` consults the orthogonal neighbour numpy - # would have checked first when that neighbour lives in another chunk - # (#2677). Cell values are (value, w_match, s_match) tuples. - cell_field = {} - if connectivity_8: - for item in boundary_polys: - cells = item[3] - if cells: - for coord, cval in cells.items(): - cell_field[coord] = cval[0] - # Union-find over polygon indices. parent = list(range(n)) @@ -577,6 +584,16 @@ def union(i, j): if ri != rj: parent[max(ri, rj)] = min(ri, rj) + # 8-connectivity float merge: numpy's diagonal is *conditional* on the + # orthogonal neighbour, so a per-pair close test over-merges (#2677). + # Replicate numpy's exact CCL predicate over the union of boundary + # cells -- including the conditional SW / SE diagonal -- and union the + # owning polygons. 4-connectivity and integer rasters keep the + # ring-edge-keyed pairwise test below unchanged. + have_cells = any(item[3] for item in boundary_polys) + if connectivity_8 and have_cells: + _boundary_ccl_unions(boundary_polys, atol, rtol, union) + for poly_indices in adjacency_index.values(): if len(poly_indices) < 2: continue @@ -593,12 +610,16 @@ def union(i, j): # scan-order ``_is_close`` orientation exactly (#2666). # Integer polygons have no cell maps and fall back to the # range check, which reduces to strict equality for them. + # 8-conn float unions are decided by the CCL pass above, + # so skip the per-pair value test for that case (its + # conditional diagonal cannot be reproduced pairwise). cells_i = boundary_polys[pi][3] cells_j = boundary_polys[pj][3] if cells_i and cells_j: + if connectivity_8: + continue close = _cells_close_directed( - cells_i, cells_j, atol, rtol, connectivity_8, - cell_field=cell_field) + cells_i, cells_j, atol, rtol, connectivity_8) else: range_i = boundary_polys[pi][2] range_j = boundary_polys[pj][2] @@ -1457,18 +1478,27 @@ def _compute_region_value_ranges(block, mask_block, connectivity_8, if cells is None: cells = {} boundary_cells[idx] = cells - # Record whether numpy CCL connected this pixel to its - # in-chunk W / S orthogonal neighbour (same region). The - # 8-conn diagonal guard (#2677) suppresses a cross-chunk - # diagonal merge when the orthogonal neighbour numpy - # checks first already matched. W is (local_row, - # local_col-1); S is (local_row-1, local_col). + # Record whether numpy CCL's per-pixel close test matched + # this pixel against its in-chunk W / S orthogonal + # neighbour. This mirrors the ``matches_W`` / ``matches_S`` + # predicates in :func:`_calculate_regions` exactly -- it is + # the direct ``_is_close(cur, neighbour)`` test, NOT region + # co-membership (two pixels can share a region via a third + # path without being directly close). The 8-conn diagonal + # guard (#2677) consults the diagonal only when this + # orthogonal test failed, matching numpy's + # ``if not matches_W`` / ``if not matches_S`` shortcut. W + # is (local_row, local_col-1); S is (local_row-1, local_col). w_match = ( - local_col > 0 and regions[ij - 1] == r) + local_col > 0 and + (mask_flat is None or mask_flat[ij - 1]) and + _is_close(v, values_flat[ij - 1], atol, rtol)) s_match = ( - local_row > 0 and regions[ij - nx_eff] == r) + local_row > 0 and + (mask_flat is None or mask_flat[ij - nx_eff]) and + _is_close(v, values_flat[ij - nx_eff], atol, rtol)) cells[(local_col + col_offset, local_row + row_offset)] = ( - v, w_match, s_match) + v, bool(w_match), bool(s_match)) out = [] cells_out = [] for i in range(n_regions): diff --git a/xrspatial/tests/test_polygonize_issue_2677.py b/xrspatial/tests/test_polygonize_issue_2677.py new file mode 100644 index 000000000..0f600d73a --- /dev/null +++ b/xrspatial/tests/test_polygonize_issue_2677.py @@ -0,0 +1,187 @@ +"""Dask 8-connectivity float polygonize parity at large rtol (#2677). + +numpy connected-component labelling consults an 8-connected diagonal +neighbour *conditionally*: a pixel only looks at its SW diagonal when its +W neighbour did not match, and at its SE diagonal only when its S +neighbour did not match (see the ``connectivity_8`` block in +``_calculate_regions``). The dask cross-chunk merge in +``_cells_close_directed`` previously honoured every diagonal pair +unconditionally, so at a large ``rtol`` it could union two regions that +numpy keeps separate when the intervening orthogonal neighbour already +matched. The result was a chunked 8-conn float raster reporting fewer +polygons (and different DN values) than the unchunked input. + +This is distinct from the #2666 rtol scan-order orientation: it fails +across chunkings that do not split the diverging pixels, so it is the +diagonal-pairing decision, not the cross-chunk boundary close test, that +was wrong. The fix carries per-cell ``w_match`` / ``s_match`` flags plus +a global boundary-value map so the merge consults a diagonal only when +numpy would, for both in-chunk and cross-chunk orthogonal neighbours. + +These tests pin numpy/dask parity for the issue repro across every +chunking, and add a deterministic fuzz over random float rasters so the +8-conn merge stays chunk-invariant. +""" +import itertools + +import numpy as np +import pytest +import xarray as xr + +try: + import cupy +except ImportError: + cupy = None + +try: + import dask.array as da +except ImportError: + da = None + +from ..polygonize import polygonize +from .general_checks import cuda_and_cupy_available, dask_array_available + + +def _to_dask(arr, chunks): + return xr.DataArray(da.from_array(arr, chunks=chunks)) + + +def _to_dask_cupy(arr, chunks): + return xr.DataArray(da.from_array(cupy.asarray(arr), chunks=chunks)) + + +def _signature(values, geoms): + """Order-independent polygon signature: DN value plus exterior-ring + min corner and vertex count. Stronger than a bare count + DN multiset + because it also pins where each polygon sits and how big it is. + """ + sig = [] + for v, rings in zip(values, geoms): + ext = rings[0] + sig.append(( + round(float(v), 9), + round(float(ext[:, 0].min()), 6), + round(float(ext[:, 1].min()), 6), + len(ext), + )) + return sorted(sig) + + +def _all_chunkings(shape): + ny, nx = shape + return sorted(set(itertools.product(range(1, ny + 1), range(1, nx + 1)))) + + +# The exact repro from the issue: numpy reports 6 polygons (8-conn, +# atol=0, rtol=0.1); the unpatched dask merge reported fewer because it +# unioned the value-2.206 region into the value-2.098 region through a +# diagonal numpy never consults. +_REPRO = np.array( + [ + [2.098, 2.43, 2.206, 2.09], + [1.847, 2.292, 1.875, 2.784], + [2.927, 1.767, 2.583, 2.058], + [2.136, 2.851, 1.142, 1.174], + ], + dtype=np.float64, +) + + +@dask_array_available +class TestReproChunkInvariant: + """The issue repro must match the unchunked numpy partition for every + chunking, not just the count. + """ + + def test_numpy_reference_polygon_count(self): + v_np, _ = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + assert len(v_np) == 6 + + @pytest.mark.parametrize("chunks", _all_chunkings(_REPRO.shape)) + def test_dask_matches_numpy(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + v_dk, g_dk = polygonize( + _to_dask(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=8) + assert len(v_dk) == len(v_np), ( + f"count mismatch chunks={chunks}: " + f"numpy={len(v_np)} dask={len(v_dk)}" + ) + assert _signature(v_dk, g_dk) == _signature(v_np, g_np) + + +@dask_array_available +class TestEightConnFuzzParity: + """Random float rasters: every chunking of an 8-conn polygonize must + reproduce the unchunked numpy polygon partition. Deterministic via a + fixed seed -- no wall-clock assertions. + """ + + @pytest.mark.parametrize("rtol", [0.0, 0.05, 0.1, 0.2]) + def test_parity_8conn(self, rtol): + rng = np.random.default_rng(2677) + for _ in range(40): + ny = int(rng.integers(3, 6)) + nx = int(rng.integers(3, 6)) + arr = np.round(rng.uniform(1.0, 3.0, size=(ny, nx)), 3) + v_np, g_np = polygonize( + xr.DataArray(arr), atol=0.0, rtol=rtol, connectivity=8) + ref = _signature(v_np, g_np) + for chunks in _all_chunkings((ny, nx)): + v_dk, g_dk = polygonize( + _to_dask(arr, chunks=chunks), + atol=0.0, rtol=rtol, connectivity=8) + assert _signature(v_dk, g_dk) == ref, ( + f"diverge arr={arr.tolist()} rtol={rtol} " + f"chunks={chunks}" + ) + + +@dask_array_available +class TestFourConnUnaffected: + """4-connectivity already matched numpy after #2675; the diagonal + guard must leave it untouched. + """ + + @pytest.mark.parametrize("chunks", _all_chunkings(_REPRO.shape)) + def test_4conn_repro_unchanged(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=4) + v_dk, g_dk = polygonize( + _to_dask(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=4) + assert _signature(v_dk, g_dk) == _signature(v_np, g_np) + + +@dask_array_available +class TestIntegerEightConnUnaffected: + """Integer 8-conn rasters use strict equality; the float diagonal + guard must not change their chunk-invariant result. + """ + + def test_integer_chunk_invariance(self): + arr = np.array([[1, 2, 1], [2, 1, 2], [1, 2, 1]], dtype=np.int32) + v_np, _ = polygonize(xr.DataArray(arr), connectivity=8) + for chunks in _all_chunkings(arr.shape): + v_dk, _ = polygonize( + _to_dask(arr, chunks=chunks), connectivity=8) + assert sorted(v_dk) == sorted(v_np) + + +@cuda_and_cupy_available +@dask_array_available +class TestReproDaskCuPy: + """dask+cupy shares the cross-chunk merge path, so the repro must + match numpy there too. + """ + + @pytest.mark.parametrize("chunks", [(2, 2), (1, 4), (4, 1)]) + def test_repro_dask_cupy(self, chunks): + v_np, g_np = polygonize( + xr.DataArray(_REPRO), atol=0.0, rtol=0.1, connectivity=8) + v_dc, g_dc = polygonize( + _to_dask_cupy(_REPRO, chunks=chunks), + atol=0.0, rtol=0.1, connectivity=8) + assert _signature(v_dc, g_dc) == _signature(v_np, g_np) From 780bdc67c821c47421123d306ab484b5aeb8c369 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 8 Jun 2026 04:15:24 -0700 Subject: [PATCH 3/3] Address review: document single-pixel invariant, pin numpy ref counts (#2677) --- xrspatial/polygonize.py | 3 +++ xrspatial/tests/test_polygonize_issue_2677.py | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/xrspatial/polygonize.py b/xrspatial/polygonize.py index 78c8c0ba9..05b01f179 100644 --- a/xrspatial/polygonize.py +++ b/xrspatial/polygonize.py @@ -397,6 +397,9 @@ def _boundary_ccl_unions(boundary_polys, atol, rtol, union): transitive closure matches numpy's single-chunk labelling. """ # Global (col, row) -> value and -> set(owner polygon index) maps. + # A coordinate is one physical pixel, so every owner sharing it + # carries the same value and the same W / S match flags; overwriting + # ``field`` and reading the flags from any one owner is therefore safe. field = {} owners = {} for idx, item in enumerate(boundary_polys): diff --git a/xrspatial/tests/test_polygonize_issue_2677.py b/xrspatial/tests/test_polygonize_issue_2677.py index 0f600d73a..4b706bf34 100644 --- a/xrspatial/tests/test_polygonize_issue_2677.py +++ b/xrspatial/tests/test_polygonize_issue_2677.py @@ -119,15 +119,22 @@ class TestEightConnFuzzParity: fixed seed -- no wall-clock assertions. """ + # Total numpy polygon count over the fixed-seed sweep, recorded per + # rtol. Pinning it means a numpy-side CCL change is caught here + # directly, rather than silently shifting the dask parity reference. + _EXPECTED_NUMPY_TOTAL = {0.0: 639, 0.05: 492, 0.1: 352, 0.2: 200} + @pytest.mark.parametrize("rtol", [0.0, 0.05, 0.1, 0.2]) def test_parity_8conn(self, rtol): rng = np.random.default_rng(2677) + numpy_total = 0 for _ in range(40): ny = int(rng.integers(3, 6)) nx = int(rng.integers(3, 6)) arr = np.round(rng.uniform(1.0, 3.0, size=(ny, nx)), 3) v_np, g_np = polygonize( xr.DataArray(arr), atol=0.0, rtol=rtol, connectivity=8) + numpy_total += len(v_np) ref = _signature(v_np, g_np) for chunks in _all_chunkings((ny, nx)): v_dk, g_dk = polygonize( @@ -137,6 +144,11 @@ def test_parity_8conn(self, rtol): f"diverge arr={arr.tolist()} rtol={rtol} " f"chunks={chunks}" ) + expected = self._EXPECTED_NUMPY_TOTAL[rtol] + assert expected is None or numpy_total == expected, ( + f"numpy 8-conn reference count changed for rtol={rtol}: " + f"got {numpy_total}, expected {expected}" + ) @dask_array_available