Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/retry-flag-requests.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"posthog-ruby": patch
"posthog-rails": patch
---

Retry feature flag requests on transient network errors (timeouts, connection resets) with backoff, so a one-off blip no longer surfaces a hard error to the caller. The retry count is configurable via the `feature_flag_request_max_retries` option (defaults to 1, set to 0 to opt out).
6 changes: 5 additions & 1 deletion lib/posthog/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def _decrement_instance_count(api_key)
# in seconds. Defaults to 30.
# @option opts [Integer] :feature_flag_request_timeout_seconds How long to wait for feature flag evaluation,
# in seconds. Defaults to 3.
# @option opts [Integer] :feature_flag_request_max_retries How many times to retry a flag request after a
# transient network error. Each retry sleeps on the calling thread before retrying, so this adds to
# worst-case latency. Defaults to 1. Set to 0 to disable retrying.
# @option opts [Proc] :before_send A callback that receives the event hash and should return either a modified
# hash to be sent to PostHog or nil to prevent the event from being sent. e.g. `before_send: ->(event) { event }`.
# @option opts [Boolean] :disable_singleton_warning +true+ to suppress the warning when multiple clients share
Expand Down Expand Up @@ -140,7 +143,8 @@ def initialize(opts = {})
opts[:host],
opts[:feature_flag_request_timeout_seconds] || Defaults::FeatureFlags::FLAG_REQUEST_TIMEOUT_SECONDS,
opts[:on_error],
flag_definition_cache_provider: opts[:flag_definition_cache_provider]
flag_definition_cache_provider: opts[:flag_definition_cache_provider],
feature_flag_request_max_retries: opts[:feature_flag_request_max_retries]
)
end

Expand Down
4 changes: 4 additions & 0 deletions lib/posthog/defaults.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ module Request

module FeatureFlags
FLAG_REQUEST_TIMEOUT_SECONDS = 3
# Number of retries for a flag request after a transient network error.
# Flag requests are stateless and cause no server-side mutation, so
# retrying is safe.
FLAG_REQUEST_MAX_RETRIES = 1
end

module Queue
Expand Down
43 changes: 33 additions & 10 deletions lib/posthog/feature_flags.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
require 'json'
require 'posthog/version'
require 'posthog/logging'
require 'posthog/defaults'
require 'posthog/backoff_policy'
require 'posthog/feature_flag'
require 'posthog/flag_definition_cache'
require 'digest'
Expand Down Expand Up @@ -34,14 +36,17 @@ class FeatureFlagsPoller
# @param feature_flag_request_timeout_seconds [Integer] Timeout for feature flag requests.
# @param on_error [Proc, nil] Callback invoked as `on_error.call(status, error)`.
# @param flag_definition_cache_provider [Object, nil] Optional {FlagDefinitionCacheProvider} implementation.
# @param feature_flag_request_max_retries [Integer, nil] Retries after a transient network error on a flag
# request. Defaults to {Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES}. Set to 0 to disable retrying.
def initialize(
polling_interval,
personal_api_key,
project_api_key,
host,
feature_flag_request_timeout_seconds,
on_error = nil,
flag_definition_cache_provider: nil
flag_definition_cache_provider: nil,
feature_flag_request_max_retries: nil
)
@polling_interval = polling_interval || 30
@personal_api_key = personal_api_key
Expand All @@ -53,6 +58,8 @@ def initialize(
@loaded_flags_successfully_once = Concurrent::AtomicBoolean.new
@feature_flags_by_key = nil
@feature_flag_request_timeout_seconds = feature_flag_request_timeout_seconds
@feature_flag_request_max_retries =
feature_flag_request_max_retries || Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES
@on_error = on_error || proc { |status, error| }
@quota_limited = Concurrent::AtomicBoolean.new(false)
@flags_etag = Concurrent::AtomicReference.new(nil)
Expand Down Expand Up @@ -1240,12 +1247,24 @@ def _request_remote_config_payload(flag_key)
_request(uri, req, @feature_flag_request_timeout_seconds)
end

# rubocop:disable Lint/ShadowedException
# Transient network errors that are safe to retry. Flag requests are
# retry-safe (stateless reads and evaluations, no server-side mutation), so
# a one-off blip (TCP retransmit, TLS jitter, an edge/proxy hiccup) should
# be absorbed by a retry rather than surfaced to the caller.
RETRYABLE_REQUEST_ERRORS = [
Timeout::Error, # covers Net::OpenTimeout, Net::ReadTimeout, Net::WriteTimeout
Errno::ECONNRESET,
EOFError
].freeze

def _request(uri, request_object, timeout = nil, include_etag: false)
request_object['User-Agent'] = "posthog-ruby/#{PostHog::VERSION}"
request_timeout = timeout || 10
backoff_policy = nil
attempts = 0

begin
attempts += 1
Net::HTTP.start(
uri.hostname,
uri.port,
Expand Down Expand Up @@ -1279,20 +1298,24 @@ def _request(uri, request_object, timeout = nil, include_etag: false)
return error_response
end
end
rescue Timeout::Error,
Errno::EINVAL,
Errno::ECONNRESET,
EOFError,
rescue *RETRYABLE_REQUEST_ERRORS => e
if attempts <= @feature_flag_request_max_retries
backoff_policy ||= BackoffPolicy.new
interval = backoff_policy.next_interval.to_f / 1000
logger.debug("Retrying request to #{_mask_tokens_in_url(uri.to_s)} after #{e.class} (attempt #{attempts})")
sleep(interval)
retry
end
logger.debug("Unable to complete request to #{_mask_tokens_in_url(uri.to_s)}")
raise
rescue Errno::EINVAL,
Net::HTTPBadResponse,
Net::HTTPHeaderSyntaxError,
Net::ReadTimeout,
Net::WriteTimeout,
Net::ProtocolError
logger.debug("Unable to complete request to #{uri}")
logger.debug("Unable to complete request to #{_mask_tokens_in_url(uri.to_s)}")
raise
end
end
# rubocop:enable Lint/ShadowedException

def _mask_tokens_in_url(url)
url.gsub(/token=([^&]{10})[^&]*/, 'token=\1...')
Expand Down
1 change: 1 addition & 0 deletions public_api_snapshot.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ constant PostHog::Defaults::BackoffPolicy::MIN_TIMEOUT_MS: Integer
constant PostHog::Defaults::BackoffPolicy::MULTIPLIER: Float
constant PostHog::Defaults::BackoffPolicy::RANDOMIZATION_FACTOR: Float
module PostHog::Defaults::FeatureFlags
constant PostHog::Defaults::FeatureFlags::FLAG_REQUEST_MAX_RETRIES: Integer
constant PostHog::Defaults::FeatureFlags::FLAG_REQUEST_TIMEOUT_SECONDS: Integer
constant PostHog::Defaults::MAX_HASH_SIZE: Integer
module PostHog::Defaults::Message
Expand Down
2 changes: 2 additions & 0 deletions spec/posthog/feature_flag_error_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ module PostHog

context 'when request fails completely' do
it 'adds timeout error to $feature_flag_called event on timeout' do
# A timeout is retryable now, so stub sleep to avoid real backoff delay.
allow_any_instance_of(PostHog::FeatureFlagsPoller).to receive(:sleep)
stub_request(:post, flags_endpoint)
.to_timeout

Expand Down
13 changes: 8 additions & 5 deletions spec/posthog/feature_flag_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -616,20 +616,23 @@ module PostHog
.to_raise(Net::ReadTimeout)

c = Client.new(api_key: API_KEY, personal_api_key: API_KEY, test_mode: true)
# Transient timeouts are retried, so each `/flags` call makes one extra
# request. Avoid real backoff sleeps in the test.
allow_any_instance_of(PostHog::FeatureFlagsPoller).to receive(:sleep)

# beta-feature falls back to `/flags`, which on error returns default
# beta-feature falls back to `/flags`, which on error (after a retry) returns default
expect(c.get_feature_flag('beta-feature', 'some-distinct-id')).to be(nil)
assert_requested :post, flags_endpoint, times: 1
assert_requested :post, flags_endpoint, times: 2
WebMock.reset_executed_requests!

# beta-feature2 falls back to `/flags`, which on error returns default
# beta-feature2 falls back to `/flags`, which on error (after a retry) returns default
expect(c.get_feature_flag('beta-feature2', 'some-distinct-id')).to be(nil)
expect(c.is_feature_enabled('beta-feature2', 'some-distinct-id')).to be(nil)
assert_requested :post, flags_endpoint, times: 2
assert_requested :post, flags_endpoint, times: 4
WebMock.reset_executed_requests!

expect(c.get_all_flags('some-distinct-id')).to eq({})
assert_requested :post, flags_endpoint, times: 1
assert_requested :post, flags_endpoint, times: 2
WebMock.reset_executed_requests!
end

Expand Down
84 changes: 84 additions & 0 deletions spec/posthog/flags_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,96 @@ module PostHog
end

it 'handles network timeouts' do
# A timeout is retryable now, so stub sleep to avoid real backoff delay.
allow(poller).to receive(:sleep)
stub_request(:post, flags_endpoint)
.to_timeout

expect { poller.get_flags('test-distinct-id') }.to raise_error(Timeout::Error)
end

context 'retrying transient network errors' do
let(:flags_response) { { featureFlags: { 'my-flag' => true }, featureFlagPayloads: {} } }

before do
# Avoid real backoff sleeps in tests.
allow(poller).to receive(:sleep)
end

it 'retries once and succeeds after a transient Net::ReadTimeout' do
stub_request(:post, flags_endpoint)
.to_raise(Net::ReadTimeout).then
.to_return(status: 200, body: flags_response.to_json)

result = poller.get_flags('test-distinct-id')

expect(result[:status]).to eq(200)
expect(result[:featureFlags]).to eq({ 'my-flag': true })
expect(a_request(:post, flags_endpoint)).to have_been_made.times(2)
expect(poller).to have_received(:sleep).once
end

it 'retries on Errno::ECONNRESET then re-raises once retries are exhausted' do
stub_request(:post, flags_endpoint)
.to_raise(Errno::ECONNRESET)

expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNRESET)
expect(a_request(:post, flags_endpoint)).to have_been_made.times(2)
end

it 'does not retry on a connection refused error' do
stub_request(:post, flags_endpoint)
.to_raise(Errno::ECONNREFUSED)

expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNREFUSED)
expect(a_request(:post, flags_endpoint)).to have_been_made.times(1)
end

it 'does not retry on a non-retryable network error' do
stub_request(:post, flags_endpoint)
.to_raise(Net::HTTPBadResponse)

expect { poller.get_flags('test-distinct-id') }.to raise_error(Net::HTTPBadResponse)
expect(a_request(:post, flags_endpoint)).to have_been_made.times(1)
end

context 'when feature_flag_request_max_retries is configured' do
let(:client) do
Client.new(
api_key: API_KEY,
personal_api_key: API_KEY,
test_mode: true,
feature_flag_request_max_retries: max_retries
)
end

context 'set to 0 (opt out)' do
let(:max_retries) { 0 }

it 'does not retry and re-raises the transient error immediately' do
stub_request(:post, flags_endpoint)
.to_raise(Net::ReadTimeout)

expect { poller.get_flags('test-distinct-id') }.to raise_error(Net::ReadTimeout)
expect(a_request(:post, flags_endpoint)).to have_been_made.times(1)
expect(poller).not_to have_received(:sleep)
end
end

context 'set to a higher count' do
let(:max_retries) { 3 }

it 'retries up to the configured number of times before re-raising' do
stub_request(:post, flags_endpoint)
.to_raise(Errno::ECONNRESET)

expect { poller.get_flags('test-distinct-id') }.to raise_error(Errno::ECONNRESET)
expect(a_request(:post, flags_endpoint)).to have_been_made.times(4)
end
end
end
end

it 'handles quota limited responses v3' do
quota_limited_response = {
flags: {},
Comment thread
haacked marked this conversation as resolved.
Expand Down
Loading