From 75b862649c40ebef203f375863c16d81c346eb51 Mon Sep 17 00:00:00 2001 From: Phil Haack Date: Wed, 24 Jun 2026 15:15:52 -0700 Subject: [PATCH 1/5] Retry flag requests on transient network errors --- .changeset/retry-flag-requests.md | 6 +++++ lib/posthog/defaults.rb | 4 ++++ lib/posthog/feature_flags.rb | 36 +++++++++++++++++++++++------- public_api_snapshot.txt | 1 + spec/posthog/feature_flag_spec.rb | 13 ++++++----- spec/posthog/flags_spec.rb | 37 +++++++++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 13 deletions(-) create mode 100644 .changeset/retry-flag-requests.md diff --git a/.changeset/retry-flag-requests.md b/.changeset/retry-flag-requests.md new file mode 100644 index 0000000..85c1fbe --- /dev/null +++ b/.changeset/retry-flag-requests.md @@ -0,0 +1,6 @@ +--- +"posthog-ruby": patch +"posthog-rails": patch +--- + +Retry feature flag requests on transient network errors (timeouts, connection resets) with backoff, so a one-off blip no longer surfaces a hard error to the caller. diff --git a/lib/posthog/defaults.rb b/lib/posthog/defaults.rb index db1e624..eea7666 100644 --- a/lib/posthog/defaults.rb +++ b/lib/posthog/defaults.rb @@ -19,6 +19,10 @@ module Request module FeatureFlags FLAG_REQUEST_TIMEOUT_SECONDS = 3 + # Number of retries for a flag request after a transient network error. + # Flag requests are stateless and cause no server-side mutation, so + # retrying is safe. + FLAG_REQUEST_MAX_RETRIES = 1 end module Queue diff --git a/lib/posthog/feature_flags.rb b/lib/posthog/feature_flags.rb index a801906..ec977ba 100644 --- a/lib/posthog/feature_flags.rb +++ b/lib/posthog/feature_flags.rb @@ -5,6 +5,8 @@ require 'json' require 'posthog/version' require 'posthog/logging' +require 'posthog/defaults' +require 'posthog/backoff_policy' require 'posthog/feature_flag' require 'posthog/flag_definition_cache' require 'digest' @@ -1240,12 +1242,26 @@ def _request_remote_config_payload(flag_key) _request(uri, req, @feature_flag_request_timeout_seconds) end - # rubocop:disable Lint/ShadowedException + # Transient network errors that are safe to retry. Flag requests are + # retry-safe (stateless reads and evaluations, no server-side mutation), so + # a one-off blip (TCP retransmit, TLS jitter, an edge/proxy hiccup) should + # be absorbed by a retry rather than surfaced to the caller. + RETRYABLE_REQUEST_ERRORS = [ + Timeout::Error, # includes Net::OpenTimeout + Errno::ECONNRESET, + EOFError, + Net::ReadTimeout, + Net::WriteTimeout + ].freeze + def _request(uri, request_object, timeout = nil, include_etag: false) request_object['User-Agent'] = "posthog-ruby/#{PostHog::VERSION}" request_timeout = timeout || 10 + backoff_policy = nil + attempts = 0 begin + attempts += 1 Net::HTTP.start( uri.hostname, uri.port, @@ -1279,20 +1295,24 @@ def _request(uri, request_object, timeout = nil, include_etag: false) return error_response end end - rescue Timeout::Error, - Errno::EINVAL, - Errno::ECONNRESET, - EOFError, + rescue *RETRYABLE_REQUEST_ERRORS => e + if attempts <= Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES + backoff_policy ||= BackoffPolicy.new + interval = backoff_policy.next_interval.to_f / 1000 + logger.debug("Retrying request to #{_mask_tokens_in_url(uri.to_s)} after #{e.class} (attempt #{attempts})") + sleep(interval) + retry + end + logger.debug("Unable to complete request to #{uri}") + raise + rescue Errno::EINVAL, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, - Net::ReadTimeout, - Net::WriteTimeout, Net::ProtocolError logger.debug("Unable to complete request to #{uri}") raise end end - # rubocop:enable Lint/ShadowedException def _mask_tokens_in_url(url) url.gsub(/token=([^&]{10})[^&]*/, 'token=\1...') diff --git a/public_api_snapshot.txt b/public_api_snapshot.txt index 2a4d4c9..ad0eff5 100644 --- a/public_api_snapshot.txt +++ b/public_api_snapshot.txt @@ -34,6 +34,7 @@ constant PostHog::Defaults::BackoffPolicy::MIN_TIMEOUT_MS: Integer constant PostHog::Defaults::BackoffPolicy::MULTIPLIER: Float constant PostHog::Defaults::BackoffPolicy::RANDOMIZATION_FACTOR: Float module PostHog::Defaults::FeatureFlags +constant PostHog::Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES: Integer constant PostHog::Defaults::FeatureFlags::FLAG_REQUEST_TIMEOUT_SECONDS: Integer constant PostHog::Defaults::MAX_HASH_SIZE: Integer module PostHog::Defaults::Message diff --git a/spec/posthog/feature_flag_spec.rb b/spec/posthog/feature_flag_spec.rb index 4cdf8c1..56dc740 100644 --- a/spec/posthog/feature_flag_spec.rb +++ b/spec/posthog/feature_flag_spec.rb @@ -616,20 +616,23 @@ module PostHog .to_raise(Net::ReadTimeout) c = Client.new(api_key: API_KEY, personal_api_key: API_KEY, test_mode: true) + # Transient timeouts are retried, so each `/flags` call makes one extra + # request. Avoid real backoff sleeps in the test. + allow_any_instance_of(PostHog::FeatureFlagsPoller).to receive(:sleep) - # beta-feature falls back to `/flags`, which on error returns default + # beta-feature falls back to `/flags`, which on error (after a retry) returns default expect(c.get_feature_flag('beta-feature', 'some-distinct-id')).to be(nil) - assert_requested :post, flags_endpoint, times: 1 + assert_requested :post, flags_endpoint, times: 2 WebMock.reset_executed_requests! - # beta-feature2 falls back to `/flags`, which on error returns default + # beta-feature2 falls back to `/flags`, which on error (after a retry) returns default expect(c.get_feature_flag('beta-feature2', 'some-distinct-id')).to be(nil) expect(c.is_feature_enabled('beta-feature2', 'some-distinct-id')).to be(nil) - assert_requested :post, flags_endpoint, times: 2 + assert_requested :post, flags_endpoint, times: 4 WebMock.reset_executed_requests! expect(c.get_all_flags('some-distinct-id')).to eq({}) - assert_requested :post, flags_endpoint, times: 1 + assert_requested :post, flags_endpoint, times: 2 WebMock.reset_executed_requests! end diff --git a/spec/posthog/flags_spec.rb b/spec/posthog/flags_spec.rb index 3b2dd31..fcc2864 100644 --- a/spec/posthog/flags_spec.rb +++ b/spec/posthog/flags_spec.rb @@ -230,6 +230,43 @@ module PostHog expect { poller.get_flags('test-distinct-id') }.to raise_error(Timeout::Error) end + context 'retrying transient network errors' do + let(:flags_response) { { featureFlags: { 'my-flag' => true }, featureFlagPayloads: {} } } + + before do + # Avoid real backoff sleeps in tests. + allow(poller).to receive(:sleep) + end + + it 'retries once and succeeds after a transient Net::ReadTimeout' do + stub_request(:post, flags_endpoint) + .to_raise(Net::ReadTimeout).then + .to_return(status: 200, body: flags_response.to_json) + + result = poller.get_flags('test-distinct-id') + + expect(result[:status]).to eq(200) + expect(result[:featureFlags]).to eq({ 'my-flag': true }) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(2) + end + + it 'retries on Errno::ECONNRESET then re-raises once retries are exhausted' do + stub_request(:post, flags_endpoint) + .to_raise(Errno::ECONNRESET) + + expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNRESET) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(2) + end + + it 'does not retry on a connection refused error' do + stub_request(:post, flags_endpoint) + .to_raise(Errno::ECONNREFUSED) + + expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNREFUSED) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(1) + end + end + it 'handles quota limited responses v3' do quota_limited_response = { flags: {}, From 7d37e259c4971ddf45ba7a8cc05b8dc9ab6153d0 Mon Sep 17 00:00:00 2001 From: Phil Haack Date: Wed, 24 Jun 2026 15:21:00 -0700 Subject: [PATCH 2/5] Mask tokens in exhausted-retry failure log Fixes #195 --- lib/posthog/feature_flags.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/posthog/feature_flags.rb b/lib/posthog/feature_flags.rb index ec977ba..42971b5 100644 --- a/lib/posthog/feature_flags.rb +++ b/lib/posthog/feature_flags.rb @@ -1303,7 +1303,7 @@ def _request(uri, request_object, timeout = nil, include_etag: false) sleep(interval) retry end - logger.debug("Unable to complete request to #{uri}") + logger.debug("Unable to complete request to #{_mask_tokens_in_url(uri.to_s)}") raise rescue Errno::EINVAL, Net::HTTPBadResponse, From 2ea40f49d6cee1468ff2af923e1bca9abf97fc2a Mon Sep 17 00:00:00 2001 From: Phil Haack Date: Wed, 24 Jun 2026 15:24:50 -0700 Subject: [PATCH 3/5] Mask tokens in non-retryable error failure log --- lib/posthog/feature_flags.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/posthog/feature_flags.rb b/lib/posthog/feature_flags.rb index 42971b5..a256da5 100644 --- a/lib/posthog/feature_flags.rb +++ b/lib/posthog/feature_flags.rb @@ -1309,7 +1309,7 @@ def _request(uri, request_object, timeout = nil, include_etag: false) Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError - logger.debug("Unable to complete request to #{uri}") + logger.debug("Unable to complete request to #{_mask_tokens_in_url(uri.to_s)}") raise end end From 7f0d7f1bacbc0947d1ca3cc82052b018a5a82279 Mon Sep 17 00:00:00 2001 From: Phil Haack Date: Wed, 24 Jun 2026 15:44:33 -0700 Subject: [PATCH 4/5] Simplify retryable request errors and improve test coverage Removes redundant error entries from RETRYABLE_REQUEST_ERRORS that are already covered by Timeout::Error (Net::ReadTimeout, Net::WriteTimeout). Updates the comment to clarify that Timeout::Error covers all timeout types. Improves test coverage by: - Stubbing sleep() calls to prevent backoff delays in tests - Adding expectation to verify retry sleep is called - Adding test case for non-retryable network errors --- lib/posthog/feature_flags.rb | 6 ++---- spec/posthog/feature_flag_error_spec.rb | 2 ++ spec/posthog/flags_spec.rb | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/posthog/feature_flags.rb b/lib/posthog/feature_flags.rb index a256da5..b110c5e 100644 --- a/lib/posthog/feature_flags.rb +++ b/lib/posthog/feature_flags.rb @@ -1247,11 +1247,9 @@ def _request_remote_config_payload(flag_key) # a one-off blip (TCP retransmit, TLS jitter, an edge/proxy hiccup) should # be absorbed by a retry rather than surfaced to the caller. RETRYABLE_REQUEST_ERRORS = [ - Timeout::Error, # includes Net::OpenTimeout + Timeout::Error, # covers Net::OpenTimeout, Net::ReadTimeout, Net::WriteTimeout Errno::ECONNRESET, - EOFError, - Net::ReadTimeout, - Net::WriteTimeout + EOFError ].freeze def _request(uri, request_object, timeout = nil, include_etag: false) diff --git a/spec/posthog/feature_flag_error_spec.rb b/spec/posthog/feature_flag_error_spec.rb index 4a5b05d..888a8f4 100644 --- a/spec/posthog/feature_flag_error_spec.rb +++ b/spec/posthog/feature_flag_error_spec.rb @@ -145,6 +145,8 @@ module PostHog context 'when request fails completely' do it 'adds timeout error to $feature_flag_called event on timeout' do + # A timeout is retryable now, so stub sleep to avoid real backoff delay. + allow_any_instance_of(PostHog::FeatureFlagsPoller).to receive(:sleep) stub_request(:post, flags_endpoint) .to_timeout diff --git a/spec/posthog/flags_spec.rb b/spec/posthog/flags_spec.rb index fcc2864..54d4575 100644 --- a/spec/posthog/flags_spec.rb +++ b/spec/posthog/flags_spec.rb @@ -224,6 +224,8 @@ module PostHog end it 'handles network timeouts' do + # A timeout is retryable now, so stub sleep to avoid real backoff delay. + allow(poller).to receive(:sleep) stub_request(:post, flags_endpoint) .to_timeout @@ -248,6 +250,7 @@ module PostHog expect(result[:status]).to eq(200) expect(result[:featureFlags]).to eq({ 'my-flag': true }) expect(a_request(:post, flags_endpoint)).to have_been_made.times(2) + expect(poller).to have_received(:sleep).once end it 'retries on Errno::ECONNRESET then re-raises once retries are exhausted' do @@ -265,6 +268,14 @@ module PostHog expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNREFUSED) expect(a_request(:post, flags_endpoint)).to have_been_made.times(1) end + + it 'does not retry on a non-retryable network error' do + stub_request(:post, flags_endpoint) + .to_raise(Net::HTTPBadResponse) + + expect { poller.get_flags('test-distinct-id') }.to raise_error(Net::HTTPBadResponse) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(1) + end end it 'handles quota limited responses v3' do From 1e1957af5f7bdca17a0da90d8fc4fab4223615c2 Mon Sep 17 00:00:00 2001 From: Phil Haack Date: Thu, 25 Jun 2026 09:10:49 -0700 Subject: [PATCH 5/5] Make feature flag request retry count configurable Adds a new `feature_flag_request_max_retries` option that lets callers control how many times to retry feature flag requests after a transient network error. Defaults to 1 (retry once) and can be set to 0 to disable retrying. Each retry sleeps on the calling thread before retrying, so higher values add to worst-case latency. Includes tests for the new configurable behavior. --- .changeset/retry-flag-requests.md | 2 +- lib/posthog/client.rb | 6 +++++- lib/posthog/feature_flags.rb | 9 ++++++-- spec/posthog/flags_spec.rb | 36 +++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/.changeset/retry-flag-requests.md b/.changeset/retry-flag-requests.md index 85c1fbe..d0f4a29 100644 --- a/.changeset/retry-flag-requests.md +++ b/.changeset/retry-flag-requests.md @@ -3,4 +3,4 @@ "posthog-rails": patch --- -Retry feature flag requests on transient network errors (timeouts, connection resets) with backoff, so a one-off blip no longer surfaces a hard error to the caller. +Retry feature flag requests on transient network errors (timeouts, connection resets) with backoff, so a one-off blip no longer surfaces a hard error to the caller. The retry count is configurable via the `feature_flag_request_max_retries` option (defaults to 1, set to 0 to opt out). diff --git a/lib/posthog/client.rb b/lib/posthog/client.rb index 014badc..c87f23f 100644 --- a/lib/posthog/client.rb +++ b/lib/posthog/client.rb @@ -67,6 +67,9 @@ def _decrement_instance_count(api_key) # in seconds. Defaults to 30. # @option opts [Integer] :feature_flag_request_timeout_seconds How long to wait for feature flag evaluation, # in seconds. Defaults to 3. + # @option opts [Integer] :feature_flag_request_max_retries How many times to retry a flag request after a + # transient network error. Each retry sleeps on the calling thread before retrying, so this adds to + # worst-case latency. Defaults to 1. Set to 0 to disable retrying. # @option opts [Proc] :before_send A callback that receives the event hash and should return either a modified # hash to be sent to PostHog or nil to prevent the event from being sent. e.g. `before_send: ->(event) { event }`. # @option opts [Boolean] :disable_singleton_warning +true+ to suppress the warning when multiple clients share @@ -140,7 +143,8 @@ def initialize(opts = {}) opts[:host], opts[:feature_flag_request_timeout_seconds] || Defaults::FeatureFlags::FLAG_REQUEST_TIMEOUT_SECONDS, opts[:on_error], - flag_definition_cache_provider: opts[:flag_definition_cache_provider] + flag_definition_cache_provider: opts[:flag_definition_cache_provider], + feature_flag_request_max_retries: opts[:feature_flag_request_max_retries] ) end diff --git a/lib/posthog/feature_flags.rb b/lib/posthog/feature_flags.rb index b110c5e..4490652 100644 --- a/lib/posthog/feature_flags.rb +++ b/lib/posthog/feature_flags.rb @@ -36,6 +36,8 @@ class FeatureFlagsPoller # @param feature_flag_request_timeout_seconds [Integer] Timeout for feature flag requests. # @param on_error [Proc, nil] Callback invoked as `on_error.call(status, error)`. # @param flag_definition_cache_provider [Object, nil] Optional {FlagDefinitionCacheProvider} implementation. + # @param feature_flag_request_max_retries [Integer, nil] Retries after a transient network error on a flag + # request. Defaults to {Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES}. Set to 0 to disable retrying. def initialize( polling_interval, personal_api_key, @@ -43,7 +45,8 @@ def initialize( host, feature_flag_request_timeout_seconds, on_error = nil, - flag_definition_cache_provider: nil + flag_definition_cache_provider: nil, + feature_flag_request_max_retries: nil ) @polling_interval = polling_interval || 30 @personal_api_key = personal_api_key @@ -55,6 +58,8 @@ def initialize( @loaded_flags_successfully_once = Concurrent::AtomicBoolean.new @feature_flags_by_key = nil @feature_flag_request_timeout_seconds = feature_flag_request_timeout_seconds + @feature_flag_request_max_retries = + feature_flag_request_max_retries || Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES @on_error = on_error || proc { |status, error| } @quota_limited = Concurrent::AtomicBoolean.new(false) @flags_etag = Concurrent::AtomicReference.new(nil) @@ -1294,7 +1299,7 @@ def _request(uri, request_object, timeout = nil, include_etag: false) end end rescue *RETRYABLE_REQUEST_ERRORS => e - if attempts <= Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES + if attempts <= @feature_flag_request_max_retries backoff_policy ||= BackoffPolicy.new interval = backoff_policy.next_interval.to_f / 1000 logger.debug("Retrying request to #{_mask_tokens_in_url(uri.to_s)} after #{e.class} (attempt #{attempts})") diff --git a/spec/posthog/flags_spec.rb b/spec/posthog/flags_spec.rb index 54d4575..f198803 100644 --- a/spec/posthog/flags_spec.rb +++ b/spec/posthog/flags_spec.rb @@ -276,6 +276,42 @@ module PostHog expect { poller.get_flags('test-distinct-id') }.to raise_error(Net::HTTPBadResponse) expect(a_request(:post, flags_endpoint)).to have_been_made.times(1) end + + context 'when feature_flag_request_max_retries is configured' do + let(:client) do + Client.new( + api_key: API_KEY, + personal_api_key: API_KEY, + test_mode: true, + feature_flag_request_max_retries: max_retries + ) + end + + context 'set to 0 (opt out)' do + let(:max_retries) { 0 } + + it 'does not retry and re-raises the transient error immediately' do + stub_request(:post, flags_endpoint) + .to_raise(Net::ReadTimeout) + + expect { poller.get_flags('test-distinct-id') }.to raise_error(Net::ReadTimeout) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(1) + expect(poller).not_to have_received(:sleep) + end + end + + context 'set to a higher count' do + let(:max_retries) { 3 } + + it 'retries up to the configured number of times before re-raising' do + stub_request(:post, flags_endpoint) + .to_raise(Errno::ECONNRESET) + + expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNRESET) + expect(a_request(:post, flags_endpoint)).to have_been_made.times(4) + end + end + end end it 'handles quota limited responses v3' do