Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 116 additions & 88 deletions xrspatial/emerging_hotspots.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@
import xarray as xr
from numba import prange

from xrspatial.convolution import convolve_2d, _convolve_2d_numpy, _convolve_2d_cupy
from xrspatial.focal import _calc_hotspots_numpy
from xrspatial.convolution import convolve_2d
from xrspatial.focal import (
_calc_hotspots_numpy,
_gistar_global_stats,
_gistar_zscore,
)
from xrspatial.utils import (
ArrayTypeFunctionMapping,
_boundary_to_dask,
_validate_boundary,
ngjit,
not_implemented_func,
Expand Down Expand Up @@ -108,6 +111,51 @@ def _check_memory(n_times, ny, nx):
)


# ---------------------------------------------------------------------------
# Per-time-step Getis-Ord Gi* z-score
# ---------------------------------------------------------------------------

def _gistar_step_finalize(weighted_sum, weight_sum, sq_weight_sum,
global_mean, global_std, n):
"""Gi* z-score from convolution terms, NaN where the neighborhood is empty.

``_gistar_zscore`` maps an empty neighborhood (``weight_sum == 0``,
e.g. a cell whose own value and every neighbor are NaN) to a z-score
of 0. Emerging hot spot analysis needs those cells to stay NaN so the
Mann-Kendall step can detect pixels that have no data across the whole
time series instead of treating them as a constant cold series.
"""
z = _gistar_zscore(weighted_sum, weight_sum, sq_weight_sum,
global_mean, global_std, n)
return np.where(weight_sum > 0, z, np.float32(np.nan))


def _gistar_step_zscore(step_data, kernel, sq_kernel, global_mean,
global_std, n, convolve):
"""Getis-Ord Gi* z-score for a single 2-D time step.

``convolve`` is the backend convolution (numpy or cupy) applied with
edge padding already handled by the caller. The neighborhood weight
sum ``W_i`` and squared-weight sum ``W2_i`` come from convolving the
validity mask with the kernel and the squared kernel, so raster edges
and interior NaN cells drop out of the sums exactly as ``hotspots()``
does. ``global_mean``, ``global_std``, and ``n`` are the global terms
over the whole space-time cube.
"""
# Single-array backends pass a concrete numpy or cupy slice, so pick the
# matching module to build the validity mask. The dask helper below works
# off da.* instead, since its slices are lazy dask arrays of either chunk
# type and convolve_2d dispatches on the chunk type at compute time.
xp = cupy.get_array_module(step_data) if cupy is not None else np
valid = (~xp.isnan(step_data)).astype(step_data.dtype)
filled = xp.where(valid > 0, step_data, step_data.dtype.type(0.0))
weighted_sum = convolve(filled, kernel)
weight_sum = convolve(valid, kernel)
sq_weight_sum = convolve(valid, sq_kernel)
return _gistar_step_finalize(weighted_sum, weight_sum, sq_weight_sum,
global_mean, global_std, n)


# ---------------------------------------------------------------------------
# Mann-Kendall helpers (Numba-JIT for use inside pixel loops)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -395,24 +443,23 @@ def _emerging_hotspots_numpy(raster, kernel, boundary='nan'):
data = raster.data.astype(np.float32)
n_times, ny, nx = data.shape

# Global statistics across entire space-time cube
global_mean = np.nanmean(data)
global_std = np.nanstd(data)
if global_std == 0:
raise ZeroDivisionError(
"Standard deviation of the input raster values is 0."
)
# Global Gi* terms across the entire space-time cube.
global_mean = np.float32(np.nanmean(data))
global_std = np.float32(np.nanstd(data))
n = int((~np.isnan(data)).sum())
_gistar_global_stats(global_mean, global_std, n)

norm_kernel = (kernel / kernel.sum()).astype(np.float32)
kernel = kernel.astype(np.float32)
sq_kernel = kernel * kernel
convolve = partial(convolve_2d, boundary=boundary)

# Compute Gi* z-scores and confidence bins per time step
gi_zscore = np.empty((n_times, ny, nx), dtype=np.float32)
gi_bin = np.empty((n_times, ny, nx), dtype=np.int8)

for t in range(n_times):
step_data = data[t]
convolved = convolve_2d(step_data, norm_kernel, boundary)
z = (convolved - global_mean) / global_std
z = _gistar_step_zscore(data[t], kernel, sq_kernel, global_mean,
global_std, n, convolve)
gi_zscore[t] = z
gi_bin[t] = _calc_hotspots_numpy(z)

Expand All @@ -432,20 +479,20 @@ def _emerging_hotspots_cupy(raster, kernel, boundary='nan'):
data = raster.data.astype(cupy.float32)
n_times, ny, nx = data.shape

global_mean = cupy.nanmean(data)
global_std = cupy.nanstd(data)
if float(global_std) == 0:
raise ZeroDivisionError(
"Standard deviation of the input raster values is 0."
)
global_mean = np.float32(float(cupy.nanmean(data)))
global_std = np.float32(float(cupy.nanstd(data)))
n = int((~cupy.isnan(data)).sum())
_gistar_global_stats(global_mean, global_std, n)

norm_kernel = (kernel / kernel.sum()).astype(np.float32)
kernel = cupy.asarray(kernel).astype(cupy.float32)
sq_kernel = kernel * kernel
convolve = partial(convolve_2d, boundary=boundary)

# GPU: convolution + z-score per time step
# GPU: per-time-step Gi* z-score
gi_zscore_gpu = cupy.empty((n_times, ny, nx), dtype=cupy.float32)
for t in range(n_times):
convolved = convolve_2d(data[t], norm_kernel, boundary)
gi_zscore_gpu[t] = (convolved - global_mean) / global_std
gi_zscore_gpu[t] = _gistar_step_zscore(
data[t], kernel, sq_kernel, global_mean, global_std, n, convolve)

# Transfer to CPU for Mann-Kendall (branching-heavy, better on CPU)
gi_zscore_cpu = cupy.asnumpy(gi_zscore_gpu)
Expand All @@ -471,10 +518,23 @@ def _emerging_hotspots_cupy(raster, kernel, boundary='nan'):
# Dask + NumPy backend
# ---------------------------------------------------------------------------

def _convolve_and_zscore_chunk(chunk, kernel, global_mean, global_std):
"""Dask chunk function: convolve -> z-score."""
convolved = _convolve_2d_numpy(chunk, kernel)
return ((convolved - global_mean) / global_std).astype(np.float32)
def _gistar_step_dask(step_data, kernel, sq_kernel, global_mean,
global_std, n, boundary):
"""Lazy per-time-step Gi* z-score for a dask-backed 2-D slice.

Mirrors ``_hotspots_dask_numpy``: the convolutions go through the
dispatching ``convolve_2d`` so the dask boundary handling matches the
numpy/cupy single-array paths exactly. NaN cells are excluded from
every neighborhood sum via the validity mask.
"""
valid = ~da.isnan(step_data)
valid_f = valid.astype(step_data.dtype)
filled = da.where(valid, step_data, step_data.dtype.type(0.0))
weighted_sum = convolve_2d(filled, kernel, boundary)
weight_sum = convolve_2d(valid_f, kernel, boundary)
sq_weight_sum = convolve_2d(valid_f, sq_kernel, boundary)
return _gistar_step_finalize(weighted_sum, weight_sum, sq_weight_sum,
global_mean, global_std, n)


def _mk_classify_dask_chunk(block):
Expand Down Expand Up @@ -520,36 +580,23 @@ def _emerging_hotspots_dask_numpy(raster, kernel, boundary='nan'):

n_times = data.shape[0]

# Pass 1: eagerly compute global statistics (two scalars)
global_mean, global_std = da.compute(da.nanmean(data), da.nanstd(data))
# Pass 1: eagerly compute global Gi* terms over the whole cube.
global_mean, global_std, n = da.compute(
da.nanmean(data), da.nanstd(data), (~da.isnan(data)).sum())
global_mean = np.float32(global_mean)
global_std = np.float32(global_std)
n = int(n)
_gistar_global_stats(global_mean, global_std, n)

if global_std == 0:
raise ZeroDivisionError(
"Standard deviation of the input raster values is 0."
)
kernel = kernel.astype(np.float32)
sq_kernel = kernel * kernel

norm_kernel = (kernel / kernel.sum()).astype(np.float32)
pad_h = norm_kernel.shape[0] // 2
pad_w = norm_kernel.shape[1] // 2

# Pass 2: per time step, convolve + z-score via map_overlap, then stack
zscore_slices = []
for t in range(n_times):
_func = partial(
_convolve_and_zscore_chunk,
kernel=norm_kernel,
global_mean=global_mean,
global_std=global_std,
)
z_t = data[t].map_overlap(
_func,
depth=(pad_h, pad_w),
boundary=_boundary_to_dask(boundary),
meta=np.array((), dtype=np.float32),
)
zscore_slices.append(z_t)
# Pass 2: per time step, lazy Gi* z-score via convolve_2d, then stack.
zscore_slices = [
_gistar_step_dask(data[t], kernel, sq_kernel, global_mean,
global_std, n, boundary)
for t in range(n_times)
]

gi_zscore = da.stack(zscore_slices, axis=0)
# Rechunk so time axis is in a single chunk (time dim is small)
Expand Down Expand Up @@ -582,12 +629,6 @@ def _emerging_hotspots_dask_numpy(raster, kernel, boundary='nan'):
# Dask + CuPy backend (GPU convolution, CPU Mann-Kendall)
# ---------------------------------------------------------------------------

def _convolve_and_zscore_cupy_chunk(chunk, kernel, global_mean, global_std):
"""Dask chunk function: GPU convolve -> z-score."""
convolved = _convolve_2d_cupy(chunk, kernel)
return ((convolved - global_mean) / global_std).astype(cupy.float32)


def _gi_bin_cupy_chunk(block):
"""Dask chunk function: z-scores -> confidence bins (CPU classify, CuPy I/O)."""
block_cpu = cupy.asnumpy(block)
Expand Down Expand Up @@ -622,36 +663,23 @@ def _emerging_hotspots_dask_cupy(raster, kernel, boundary='nan'):

n_times = data.shape[0]

# Pass 1: eagerly compute global statistics (two scalars)
global_mean, global_std = da.compute(da.nanmean(data), da.nanstd(data))
# Pass 1: eagerly compute global Gi* terms over the whole cube.
global_mean, global_std, n = da.compute(
da.nanmean(data), da.nanstd(data), (~da.isnan(data)).sum())
global_mean = np.float32(float(global_mean))
global_std = np.float32(float(global_std))

if global_std == 0:
raise ZeroDivisionError(
"Standard deviation of the input raster values is 0."
)

norm_kernel = (kernel / kernel.sum()).astype(np.float32)
pad_h = norm_kernel.shape[0] // 2
pad_w = norm_kernel.shape[1] // 2

# Pass 2: per time step, GPU convolve + z-score via map_overlap, then stack
_func = partial(
_convolve_and_zscore_cupy_chunk,
kernel=norm_kernel,
global_mean=global_mean,
global_std=global_std,
)
zscore_slices = []
for t in range(n_times):
z_t = data[t].map_overlap(
_func,
depth=(pad_h, pad_w),
boundary=_boundary_to_dask(boundary, is_cupy=True),
meta=cupy.array((), dtype=cupy.float32),
)
zscore_slices.append(z_t)
n = int(n)
_gistar_global_stats(global_mean, global_std, n)

kernel = kernel.astype(np.float32)
sq_kernel = kernel * kernel

# Pass 2: per time step, lazy GPU Gi* z-score via convolve_2d, then stack.
zscore_slices = [
_gistar_step_dask(data[t], kernel, sq_kernel, global_mean,
global_std, n, boundary)
for t in range(n_times)
]

gi_zscore = da.stack(zscore_slices, axis=0)
gi_zscore = gi_zscore.rechunk({0: n_times})
Expand Down
94 changes: 94 additions & 0 deletions xrspatial/tests/test_emerging_hotspots.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,100 @@ def test_invalid_boundary(self):
emerging_hotspots(raster, _kernel_3x3(), boundary='invalid')


# ---------------------------------------------------------------------------
# Per-step Getis-Ord Gi* z-score (issue #2804)
# ---------------------------------------------------------------------------

def _gistar_reference(step, global_mean, global_std, n, kernel):
"""Hand-rolled per-step Gi* z-score using the focal Gi* helpers.

Mirrors what hotspots() computes, but with the global mean/std/n taken
over the whole space-time cube (the emerging_hotspots convention).
"""
from xrspatial.focal import _gistar_convolutions_numpy, _gistar_zscore

ws, wsum, sq = _gistar_convolutions_numpy(step, kernel, 'nan')
z = _gistar_zscore(ws, wsum, sq, global_mean, global_std, n)
return np.where(wsum > 0, z, np.float32(np.nan))


class TestGiStarStep:
"""The per-time-step z-score must be the real Gi* statistic, not the
old local-mean z-score (convolve with a normalized kernel then z-score
the local mean against the global mean/std)."""

def test_binary_kernel_matches_reference(self):
rng = np.random.default_rng(2804)
data = rng.standard_normal((4, 12, 12)).astype('f4')
kernel = np.ones((3, 3), dtype=np.float32)
ds = emerging_hotspots(_make_raster(data), kernel)

gm = np.float32(np.nanmean(data))
gs = np.float32(np.nanstd(data))
n = int((~np.isnan(data)).sum())
for t in range(data.shape[0]):
ref = _gistar_reference(data[t], gm, gs, n, kernel)
got = ds['gi_zscore'].values[t]
np.testing.assert_allclose(got, ref, atol=1e-5, equal_nan=True)

def test_weighted_kernel_matches_reference(self):
# A non-binary kernel makes W_i != W2_i, exercising the squared
# weight-sum term the old local-mean formula dropped entirely.
rng = np.random.default_rng(28041)
data = rng.standard_normal((3, 11, 11)).astype('f4')
kernel = np.array(
[[0.5, 1.0, 0.5],
[1.0, 2.0, 1.0],
[0.5, 1.0, 0.5]], dtype=np.float32,
)
ds = emerging_hotspots(_make_raster(data), kernel)

gm = np.float32(np.nanmean(data))
gs = np.float32(np.nanstd(data))
n = int((~np.isnan(data)).sum())
for t in range(data.shape[0]):
ref = _gistar_reference(data[t], gm, gs, n, kernel)
got = ds['gi_zscore'].values[t]
np.testing.assert_allclose(got, ref, atol=1e-5, equal_nan=True)

def test_differs_from_local_mean_zscore(self):
# Regression guard: the old code convolved with a normalized kernel
# and z-scored the local mean. That formula must NOT reproduce the
# current output for a weighted kernel.
rng = np.random.default_rng(28042)
data = rng.standard_normal((2, 10, 10)).astype('f4')
kernel = np.array(
[[0.0, 1.0, 0.0],
[1.0, 4.0, 1.0],
[0.0, 1.0, 0.0]], dtype=np.float32,
)
ds = emerging_hotspots(_make_raster(data), kernel)

from xrspatial.convolution import convolve_2d
gm = np.nanmean(data)
gs = np.nanstd(data)
norm_kernel = (kernel / kernel.sum()).astype(np.float32)
old = (convolve_2d(data[0], norm_kernel, 'nan') - gm) / gs

got = ds['gi_zscore'].values[0]
interior = (slice(1, -1), slice(1, -1))
assert not np.allclose(
got[interior], old[interior], atol=1e-3
), "per-step z-score still matches the old local-mean formula"

def test_interior_nan_excluded_from_neighborhood(self):
# An interior NaN must drop out of the Gi* neighborhood sums; the
# surrounding finite cells still get finite z-scores.
rng = np.random.default_rng(28043)
data = rng.standard_normal((2, 9, 9)).astype('f4')
data[:, 4, 4] = np.nan
ds = emerging_hotspots(_make_raster(data), _kernel_3x3())
z0 = ds['gi_zscore'].values[0]
# Neighbors of the NaN cell are interior and stay finite.
assert np.isfinite(z0[3, 4])
assert np.isfinite(z0[5, 4])


# ---------------------------------------------------------------------------
# Normal CDF approximation
# ---------------------------------------------------------------------------
Expand Down
Loading