From a5817f90b9fbecb00f08d668b29633dcabe8cf84 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Thu, 4 Jun 2026 10:58:31 +0000 Subject: [PATCH] test(e2e): disable telemetry process-wide to deflake retry tests Follow-up to #822, which was insufficient. The retry tests in PySQLRetryTestsMixin patch urllib3 globally (HTTPSConnectionPool._get_conn / _validate_conn) and assert the connection's own request was retried exactly N times. #822 disabled telemetry on the retry tests' OWN connections, but the flake persisted: under pytest-xdist, MANY other e2e tests run in the SAME worker process before the retry tests, and each opens a connection with telemetry enabled by default. Telemetry is a process-global singleton (TelemetryClientFactory) with a shared executor and a daemon flush thread, so those prior tests leave real TelemetryClient instances and background threads/futures registered in the worker. When a retry test installs its global urllib3 mock, an in-flight or subsequently-fired telemetry POST to /telemetry-ext from a *previous* test's client goes through the same mocked pool and inflates the mock call_count past the expected value -> flaky AssertionError. CI logs on #825 show /telemetry-ext retries firing inside the failing retry test even though that test's own connection had telemetry disabled. Fix: add an autouse, function-scoped fixture in a new tests/e2e/conftest.py that force-installs NoopTelemetryClient for every e2e connection (and drains the factory before/after each test). With no real TelemetryClient ever created in the worker, there are no background telemetry threads or futures to leak into any test's urllib3 mock. This removes the flake at its source deterministically rather than draining racy background threads. The telemetry suite (tests/e2e/test_telemetry_e2e.py) asserts real telemetry behavior and is marked @pytest.mark.serial while managing its own factory state, so the fixture explicitly skips serial-marked tests. Verified against dogfood: - A normal connection opened with telemetry defaulting ON gets a NoopTelemetryClient (no real executor/flush thread). - serial-marked tests keep the real initialize_telemetry_client; non-serial tests get the no-op installer. - test_telemetry_e2e.py still passes (12 tests, real telemetry). - Full PySQLRetryTestsMixin passes (36 tests). Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- tests/e2e/conftest.py | 107 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/e2e/conftest.py diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py new file mode 100644 index 000000000..176d027a9 --- /dev/null +++ b/tests/e2e/conftest.py @@ -0,0 +1,107 @@ +"""Shared e2e test fixtures. + +The most important fixture here is ``_disable_telemetry_for_e2e``, which +eliminates a long-standing source of flakiness in the retry tests. + +Background +---------- +The retry tests in ``tests/e2e/common/retry_test_mixins.py`` patch urllib3 +*globally* (``HTTPSConnectionPool._get_conn`` / ``._validate_conn``) and assert +that the connection's own request was retried exactly N times (e.g. +``assert call_count == 6``). The connector's telemetry feature is a +process-global singleton (``TelemetryClientFactory``) with a shared +``ThreadPoolExecutor`` and a daemon periodic-flush thread. Every ordinary e2e +test (``test_dates``, ``test_decimals``, ...) opens a connection with telemetry +enabled by default, which leaves a real ``TelemetryClient`` plus background +threads/futures registered in that global factory. + +Under pytest-xdist, those ordinary tests run in the SAME worker process as the +retry tests. When a retry test installs its global urllib3 mock, an in-flight +or subsequently-fired telemetry POST to ``/telemetry-ext`` from a *previous* +test's lingering client goes through the same mocked pool and inflates the +mock's ``call_count`` past the expected value -> flaky ``AssertionError``. CI +logs confirm ``/telemetry-ext`` retries appearing *inside* the failing retry +test even after that test's own connection disabled telemetry. + +Fix +--- +Disable telemetry for *every* e2e connection (not just the retry tests) by +force-installing ``NoopTelemetryClient`` for the duration of each test. With no +real ``TelemetryClient`` ever created in the worker, there are no background +telemetry threads/futures to leak into any test's urllib3 mock. This removes +the flake at its source deterministically, rather than trying to drain racy +background threads. + +The telemetry suite (``tests/e2e/test_telemetry_e2e.py``) is the one place that +asserts real telemetry behavior; it is marked ``@pytest.mark.serial`` and +manages its own ``TelemetryClientFactory`` state, so this fixture explicitly +skips ``serial``-marked tests. +""" + +import pytest + +from databricks.sql.telemetry.telemetry_client import ( + NoopTelemetryClient, + TelemetryClientFactory, + _TelemetryClientHolder, +) + + +def _drain_telemetry_factory(): + """Hard-reset the process-global telemetry factory: close clients, stop the + flush thread, shut the executor down, and clear all state.""" + with TelemetryClientFactory._lock: + for holder in list(TelemetryClientFactory._clients.values()): + try: + holder.client.close() + except Exception: + pass + try: + TelemetryClientFactory._stop_flush_thread() + except Exception: + pass + if TelemetryClientFactory._executor is not None: + try: + TelemetryClientFactory._executor.shutdown(wait=True) + except Exception: + pass + TelemetryClientFactory._clients = {} + TelemetryClientFactory._executor = None + TelemetryClientFactory._initialized = False + + +@pytest.fixture(autouse=True) +def _disable_telemetry_for_e2e(request, monkeypatch): + """Force every e2e connection to use NoopTelemetryClient so no real + telemetry background work runs in the worker process. + + Skipped for ``serial``-marked tests (the telemetry suite), which assert + real telemetry behavior and manage their own factory state. + """ + if request.node.get_closest_marker("serial") is not None: + # Let the telemetry suite exercise real telemetry. + yield + return + + # Clear any real telemetry state left by an earlier test in this worker, + # then make all subsequent telemetry initialization a no-op. + _drain_telemetry_factory() + + def _install_noop(*args, host_url=None, **kwargs): + host_url = TelemetryClientFactory.getHostUrlSafely(host_url) + with TelemetryClientFactory._lock: + TelemetryClientFactory._clients[host_url] = _TelemetryClientHolder( + NoopTelemetryClient() + ) + + monkeypatch.setattr( + TelemetryClientFactory, + "initialize_telemetry_client", + staticmethod(_install_noop), + ) + + try: + yield + finally: + # Leave the factory clean for the next test regardless of what ran. + _drain_telemetry_factory()