@@ -22,18 +22,23 @@ package gce
import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"reflect"
	"sort"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/sync/errgroup"

	"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud"
	"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta"
	"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/mock"
	"google.golang.org/api/compute/v1"
	"google.golang.org/api/googleapi"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)
@@ -2502,3 +2507,204 @@ func TestEnsureInternalLoadBalancerClass(t *testing.T) {
25022507 }
25032508 }
25042509}
2510+
2511+ func TestEnsureInternalBackendServiceConflict (t * testing.T ) {
2512+ t .Parallel ()
2513+
2514+ vals := DefaultTestClusterValues ()
2515+ nodeNames := []string {"test-node-1" }
2516+
2517+ gce , err := fakeGCECloud (vals )
2518+ require .NoError (t , err )
2519+
2520+ svc := fakeLoadbalancerService (string (LBTypeInternal ))
2521+ lbName := gce .GetLoadBalancerName (context .TODO (), "" , svc )
2522+ nodes , err := createAndInsertNodes (gce , nodeNames , vals .ZoneName )
2523+ require .NoError (t , err )
2524+ igName := makeInstanceGroupName (vals .ClusterID )
2525+ igLinks , err := gce .ensureInternalInstanceGroups (igName , nodes )
2526+ require .NoError (t , err )
2527+
2528+ sharedBackend := shareBackendService (svc )
2529+ bsName := makeBackendServiceName (lbName , vals .ClusterID , sharedBackend , cloud .SchemeInternal , "TCP" , svc .Spec .SessionAffinity )
2530+
2531+ // Create backend initially
2532+ err = gce .ensureInternalBackendService (bsName , "description" , svc .Spec .SessionAffinity , cloud .SchemeInternal , "TCP" , igLinks , "" )
2533+ require .NoError (t , err )
2534+
2535+ // Mock 412 error
2536+ c := gce .c .(* cloud.MockGCE )
2537+ c .MockRegionBackendServices .UpdateHook = func (ctx context.Context , key * meta.Key , obj * compute.BackendService , m * cloud.MockRegionBackendServices , options ... cloud.Option ) error {
2538+ return & googleapi.Error {Code : http .StatusPreconditionFailed , Message : "Precondition Failed" }
2539+ }
2540+
2541+ // Update the Backend Service to trigger the update hook
2542+ err = gce .ensureInternalBackendService (bsName , "description" , v1 .ServiceAffinityNone , cloud .SchemeInternal , "TCP" , igLinks , "" )
2543+
2544+ // Verify that the error is propagated
2545+ require .Error (t , err )
2546+ assert .Contains (t , err .Error (), "Precondition Failed" )
2547+ assert .IsType (t , & googleapi.Error {}, err )
2548+ if gErr , ok := err .(* googleapi.Error ); ok {
2549+ assert .Equal (t , http .StatusPreconditionFailed , gErr .Code )
2550+ }
2551+ }
2552+
2553+ func TestResourceLockErrorRecovery (t * testing.T ) {
2554+ t .Parallel ()
2555+ vals := DefaultTestClusterValues ()
2556+ gce , _ := fakeGCECloud (vals )
2557+ c := gce .c .(* cloud.MockGCE )
2558+
2559+ svc := fakeLoadbalancerService (string (LBTypeInternal ))
2560+ svcName := types.NamespacedName {Name : svc .Name , Namespace : svc .Namespace }
2561+
2562+ var calls int32
2563+ c .MockHealthChecks .InsertHook = func (ctx context.Context , key * meta.Key , obj * compute.HealthCheck , m * cloud.MockHealthChecks , options ... cloud.Option ) (bool , error ) {
2564+ if atomic .AddInt32 (& calls , 1 ) == 1 {
2565+ return true , & googleapi.Error {Code : http .StatusInternalServerError , Message : "Simulated GCP Error" }
2566+ }
2567+ return false , nil
2568+ }
2569+
2570+ // 1st request should error out and release lock
2571+ _ , err := gce .ensureInternalHealthCheck ("hc-lock-test" , svcName , true , "/" , 80 )
2572+ require .Error (t , err )
2573+ assert .Contains (t , err .Error (), "Simulated GCP Error" )
2574+
2575+ // 2nd request should successfully acquire the lock, create the health check, and succeed.
2576+ // We use a channel to ensure that if the lock was leaked, the test fails quickly instead of timing out.
2577+ errCh := make (chan error , 1 )
2578+ hcCh := make (chan * compute.HealthCheck , 1 )
2579+ go func () {
2580+ hc , err := gce .ensureInternalHealthCheck ("hc-lock-test" , svcName , true , "/" , 80 )
2581+ errCh <- err
2582+ hcCh <- hc
2583+ }()
2584+
2585+ select {
2586+ case err := <- errCh :
2587+ hc := <- hcCh
2588+ require .NoError (t , err )
2589+ assert .NotNil (t , hc )
2590+ case <- time .After (2 * time .Second ):
2591+ t .Fatal ("Deadlock detected: Second request timed out trying to acquire lock. The lock was likely leaked." )
2592+ }
2593+
2594+ assert .Equal (t , int32 (2 ), atomic .LoadInt32 (& calls ))
2595+ }
2596+
2597+ func TestEnsureInternalInstanceGroupNodeSyncScaling (t * testing.T ) {
2598+ t .Parallel ()
2599+ vals := DefaultTestClusterValues ()
2600+ gce , _ := fakeGCECloud (vals )
2601+ c := gce .c .(* cloud.MockGCE )
2602+
2603+ igName := "test-ig-node-scale"
2604+ zone := vals .ZoneName
2605+
2606+ // Inject a small sleep in Get and Insert to widen the race window.
2607+ c .MockInstanceGroups .GetHook = func (ctx context.Context , key * meta.Key , m * cloud.MockInstanceGroups , options ... cloud.Option ) (bool , * compute.InstanceGroup , error ) {
2608+ time .Sleep (2 * time .Millisecond )
2609+ return false , nil , nil
2610+ }
2611+ c .MockInstanceGroups .InsertHook = func (ctx context.Context , key * meta.Key , obj * compute.InstanceGroup , m * cloud.MockInstanceGroups , options ... cloud.Option ) (bool , error ) {
2612+ time .Sleep (2 * time .Millisecond )
2613+ return false , nil
2614+ }
2615+
2616+ var eg errgroup.Group
2617+ workers := 20
2618+
2619+ for i := 0 ; i < workers ; i ++ {
2620+ workerID := i
2621+ eg .Go (func () error {
2622+ var nodes []* v1.Node
2623+ for j := 0 ; j < (workerID % 5 )+ 1 ; j ++ {
2624+ nodeName := fmt .Sprintf ("node-%d" , j )
2625+ nodes = append (nodes , & v1.Node {
2626+ ObjectMeta : metav1.ObjectMeta {Name : nodeName },
2627+ })
2628+ }
2629+
2630+ _ , err := gce .ensureInternalInstanceGroup (igName , zone , nodes , nil )
2631+ return err
2632+ })
2633+ }
2634+
2635+ err := eg .Wait ()
2636+ require .NoError (t , err , "All workers should complete without error" )
2637+
2638+ // We verify that the final state precisely matches one of the expected valid subsets.
2639+ instances , err := gce .ListInstancesInInstanceGroup (igName , zone , "ALL" )
2640+ require .NoError (t , err )
2641+
2642+ actualNodes := make (map [string ]bool )
2643+ for _ , ins := range instances {
2644+ parts := strings .Split (ins .Instance , "/" )
2645+ actualNodes [parts [len (parts )- 1 ]] = true
2646+ }
2647+
2648+ validStates := []map [string ]bool {
2649+ {"node-0" : true },
2650+ {"node-0" : true , "node-1" : true },
2651+ {"node-0" : true , "node-1" : true , "node-2" : true },
2652+ {"node-0" : true , "node-1" : true , "node-2" : true , "node-3" : true },
2653+ {"node-0" : true , "node-1" : true , "node-2" : true , "node-3" : true , "node-4" : true },
2654+ }
2655+
2656+ isValid := false
2657+ for _ , state := range validStates {
2658+ if reflect .DeepEqual (actualNodes , state ) {
2659+ isValid = true
2660+ break
2661+ }
2662+ }
2663+ assert .True (t , isValid , "Final InstanceGroup count should precisely match exactly one of the known synchronized states, got: %v" , actualNodes )
2664+ }
2665+
2666+ func TestSharedVsNonSharedHealthCheckContention (t * testing.T ) {
2667+ t .Parallel ()
2668+ vals := DefaultTestClusterValues ()
2669+ gce , _ := fakeGCECloud (vals )
2670+ c := gce .c .(* cloud.MockGCE )
2671+
2672+ svc := fakeLoadbalancerService (string (LBTypeInternal ))
2673+ svcName := types.NamespacedName {Name : svc .Name , Namespace : svc .Namespace }
2674+
2675+ var sharedInsertCount int32
2676+
2677+ c .MockHealthChecks .InsertHook = func (ctx context.Context , key * meta.Key , obj * compute.HealthCheck , m * cloud.MockHealthChecks , options ... cloud.Option ) (bool , error ) {
2678+ time .Sleep (5 * time .Millisecond )
2679+ if obj .Name == "shared-hc" {
2680+ atomic .AddInt32 (& sharedInsertCount , 1 )
2681+ }
2682+ return false , nil
2683+ }
2684+ c .MockHealthChecks .GetHook = func (ctx context.Context , key * meta.Key , m * cloud.MockHealthChecks , options ... cloud.Option ) (bool , * compute.HealthCheck , error ) {
2685+ time .Sleep (5 * time .Millisecond )
2686+ return false , nil , nil
2687+ }
2688+
2689+ var eg errgroup.Group
2690+ workers := 50
2691+
2692+ for i := 0 ; i < workers ; i ++ {
2693+ workerID := i
2694+ eg .Go (func () error {
2695+ if workerID % 2 == 0 {
2696+ _ , err := gce .ensureInternalHealthCheck ("shared-hc" , svcName , true , "/" , 80 )
2697+ return err
2698+ } else {
2699+ hcName := fmt .Sprintf ("unique-hc-%d" , workerID )
2700+ _ , err := gce .ensureInternalHealthCheck (hcName , svcName , false , "/" , 80 )
2701+ return err
2702+ }
2703+ })
2704+ }
2705+
2706+ err := eg .Wait ()
2707+ require .NoError (t , err , "All health check routines should complete without error" )
2708+
2709+ assert .Equal (t , int32 (1 ), atomic .LoadInt32 (& sharedInsertCount ), "Shared health check should only be inserted exactly once" )
2710+ }
0 commit comments