From ca0bafb1ddc0a13408f0c5e0a1b995d688ff6553 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Wed, 3 Jun 2026 08:55:06 +0000 Subject: [PATCH] test(retry/e2e): deflake retry-count tests by disabling telemetry The retry tests in PySQLRetryTestsMixin patch urllib3 globally (HTTPSConnectionPool._get_conn / _validate_conn) and assert the connection's own request was retried exactly N times. With telemetry on (the default), the same connection makes two kinds of side-channel HTTP calls through the same mocked pool that inflate the count and flake the assertion (e.g. `assert call_count == 6` -> AssertionError under merge-queue load): 1. A synchronous feature-flag GET during Connection.__init__ (is_telemetry_enabled -> FeatureFlagsContext._refresh_flags), and 2. Async telemetry POSTs to /telemetry-ext from the background executor and the periodic flush thread. The prior mitigation (_isolated_from_telemetry, #812) only attacked source #2 and imperfectly, so the tests kept flaking. Fix (test-only): - Add enable_telemetry=False to the shared _retry_policy dict. It short-circuits is_telemetry_enabled before the feature-flag fetch (kills #1) and installs a NoopTelemetryClient (kills #2), removing both sources at their origin without changing the retry behavior under test. Every test in the mixin merges _retry_policy, so this one line covers them all. - Harden _isolated_from_telemetry with a backstop that no-ops the actual HTTP boundary (_send_with_unified_client), closing the already-submitted-future gap for any other caller of the helper. - Add a version_string forward-compat shim to the SimpleHttpResponse mock so the suite also runs on urllib3 >= 2.3 (harmless on the 2.2.x pinned in CI). Verified against dogfood: test_oserror_retries and test_retry_max_count_not_exceeded pass 3/3 deterministically for both Thrift and SEA; the full PySQLRetryTestsMixin (36 tests) passes. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- tests/e2e/common/retry_test_mixins.py | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py index 08876e145..bd4d669b4 100755 --- a/tests/e2e/common/retry_test_mixins.py +++ b/tests/e2e/common/retry_test_mixins.py @@ -77,6 +77,9 @@ def _noop_export(self, event): return None try: + def _noop_send_with_unified_client(self, *args, **kwargs): + return None + with patch.object( TelemetryClientFactory, "initialize_telemetry_client", @@ -85,6 +88,17 @@ def _noop_export(self, event): TelemetryClient, "_send_telemetry", _noop_send ), patch.object( TelemetryClient, "_export_event", _noop_export + ), patch.object( + # Backstop layer 4: no-op the actual HTTP boundary. _send_telemetry + # only suppresses NEW submissions; a future already submitted (e.g. by + # the holder.client.close() flush above, which runs before these + # patches apply) still calls _send_with_unified_client on a background + # thread and would hit the network inside the test's urllib3 mock. + # Patching the HTTP method itself closes that already-submitted-future + # gap for any caller of this helper. + TelemetryClient, + "_send_with_unified_client", + _noop_send_with_unified_client, ): yield finally: @@ -187,6 +201,10 @@ def __init__(self, status: int, headers: dict, redirect_location: Optional[str] self.msg = self.headers # For urllib3~=1.0.0 compatibility self.reason = "Mocked Response" self.version = 11 + # urllib3 >= 2.3 reads `version_string` off the raw httplib response in + # HTTPConnectionPool._make_request. Older urllib3 (incl. the 2.2.x pinned + # in CI) never touches it, so this is a harmless forward-compat shim. + self.version_string = "HTTP/1.1" self.length = 0 self.length_remaining = 0 self._redirect_location = redirect_location @@ -288,6 +306,22 @@ class PySQLRetryTestsMixin: "_retry_stop_after_attempts_count": 5, "_retry_stop_after_attempts_duration": 30, "_retry_delay_default": 0.5, + # Disable telemetry on every retry-count test. These tests patch urllib3 + # globally (HTTPSConnectionPool._get_conn / _validate_conn) and assert the + # connection's own request was retried exactly N times. With telemetry on + # (the default), the same connection makes two kinds of side-channel HTTP + # calls through the same mocked pool that inflate the count and flake the + # assertion: + # 1. A synchronous feature-flag GET during Connection.__init__ + # (is_telemetry_enabled -> FeatureFlagsContext._refresh_flags), and + # 2. Async telemetry POSTs to /telemetry-ext from the background executor + # and the periodic flush thread. + # enable_telemetry=False short-circuits is_telemetry_enabled before the + # feature-flag fetch and installs a NoopTelemetryClient, removing both + # sources at their origin without changing the retry behavior under test. + # Every test in this mixin merges _retry_policy into its connection params, + # so setting it here covers them all. See also _isolated_from_telemetry(). + "enable_telemetry": False, } @pytest.mark.parametrize(