Skip to content

Commit 9b5a6c8

Browse files
committed
azure: Switch to user-assigned managed identity for control-plane
Azure RBAC role assignment propagation can take minutes after creation. With system-assigned identity, the VMSS must be created first to get a PrincipalID, then the role assignment is created, and then RBAC must propagate before nodeup can read from blob storage. This causes 3-4 minutes of 403 errors during bootstrap. The switch to user-assigned managed identity allows the identity and role assignments to be created before the VMSS. By the time VMs boot, RBAC should already be propagated, eliminating the delay. Signed-off-by: Ciprian Hacman <ciprian@hakman.dev>
1 parent ac593b2 commit 9b5a6c8

37 files changed

+2982
-150
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ require (
105105
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
106106
dario.cat/mergo v1.0.1 // indirect
107107
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
108+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 // indirect
108109
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
109110
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
110111
github.com/GoogleCloudPlatform/k8s-cloud-provider v1.25.0 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0 h1:2qsI
2828
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0/go.mod h1:AW8VEadnhw9xox+VaVd9sP7NjzOAnaZBLRH6Tq3cJ38=
2929
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/managementgroups/armmanagementgroups v1.0.0 h1:pPvTJ1dY0sA35JOeFq6TsY2xj6Z85Yo23Pj4wCCvu4o=
3030
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/managementgroups/armmanagementgroups v1.0.0/go.mod h1:mLfWfj8v3jfWKsL9G4eoBoXVcsqcIUTapmdKy7uGOp0=
31+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 h1:L7G3dExHBgUxsO3qpTGhk/P2dgnYyW48yn7AO33Tbek=
32+
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0/go.mod h1:Ms6gYEy0+A2knfKrwdatsggTXYA2+ICKug8w7STorFw=
3133
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.1.0 h1:QM6sE5k2ZT/vI5BEe0r7mqjsUSnhVBFbOsVkEuaEfiA=
3234
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.1.0/go.mod h1:243D9iHbcQXoFUtgHJwL7gl2zx1aDuDMjvBZVGr2uW0=
3335
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 h1:Dd+RhdJn0OTtVGaeDLZpcumkIVCtA/3/Fo42+eoYvVM=

pkg/model/azuremodel/context.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -97,22 +97,32 @@ func (c *AzureModelContext) LinkToApplicationSecurityGroupNodes() *azuretasks.Ap
9797
return &azuretasks.ApplicationSecurityGroup{Name: fi.PtrTo(c.NameForApplicationSecurityGroupNodes())}
9898
}
9999

100-
// CloudTagsForInstanceGroup computes the tags to apply to instances in the specified InstanceGroup
101-
// Mostly copied from pkg/model/context.go, but "/" in tag keys are replaced with "_" as Azure
102-
// doesn't allow "/" in tag keys.
103-
func (c *AzureModelContext) CloudTagsForInstanceGroup(ig *kops.InstanceGroup) map[string]*string {
100+
// CloudTagsForClusterResource returns tags for a cluster-scoped resource
101+
// (one that is not tied to a specific instance group).
102+
func (c *AzureModelContext) CloudTagsForClusterResource() map[string]*string {
103+
labels := make(map[string]string)
104+
for k, v := range c.Cluster.Spec.CloudLabels {
105+
labels[k] = v
106+
}
107+
labels[azure.TagClusterName] = c.ClusterName()
108+
return toAzureTags(labels)
109+
}
110+
111+
// CloudTagsForInstanceGroupResource computes the tags to apply to instances in the specified InstanceGroup.
112+
// "/" in tag keys are replaced with "_" as Azure doesn't allow "/" in tag keys.
113+
func (c *AzureModelContext) CloudTagsForInstanceGroupResource(ig *kops.InstanceGroup) map[string]*string {
104114
const (
105115
clusterNodeTemplateLabel = "k8s.io_cluster_node-template_label_"
106116
clusterNodeTemplateTaint = "k8s.io_cluster_node-template_taint_"
107117
)
108118

109119
labels := make(map[string]string)
110-
// Apply any user-specified global labels first so they can be overridden by IG-specific labels.
120+
// Apply any user-specified global labels first (may be overridden by IG-level labels).
111121
for k, v := range c.Cluster.Spec.CloudLabels {
112122
labels[k] = v
113123
}
114124

115-
// Apply any user-specified labels.
125+
// Apply any user-specified IG labels (may override cluster-level labels).
116126
for k, v := range ig.Spec.CloudLabels {
117127
labels[k] = v
118128
}
@@ -134,7 +144,8 @@ func (c *AzureModelContext) CloudTagsForInstanceGroup(ig *kops.InstanceGroup) ma
134144
}
135145
}
136146

137-
// The system tags take priority because the cluster likely breaks without them...
147+
// System tags take priority because the cluster likely breaks without them.
148+
labels[azure.TagClusterName] = c.ClusterName()
138149
labels[azure.TagNameRolePrefix+ig.Spec.Role.ToLowerString()] = "1"
139150
if ig.Spec.Role == kops.InstanceGroupRoleControlPlane {
140151
labels[azure.TagNameRolePrefix+"master"] = "1"
@@ -143,7 +154,10 @@ func (c *AzureModelContext) CloudTagsForInstanceGroup(ig *kops.InstanceGroup) ma
143154
// Set the tag used by kops-controller to identify the instance group to which the VM ScaleSet belongs.
144155
labels[nodeidentityazure.InstanceGroupNameTag] = ig.Name
145156

146-
// Replace all "/" with "_" as "/" is not an allowed key character in Azure.
157+
return toAzureTags(labels)
158+
}
159+
160+
func toAzureTags(labels map[string]string) map[string]*string {
147161
m := make(map[string]*string)
148162
for k, v := range labels {
149163
m[strings.ReplaceAll(k, "/", "_")] = fi.PtrTo(v)

pkg/model/azuremodel/context_test.go

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ package azuremodel
1818

1919
import (
2020
"reflect"
21+
"strings"
2122
"testing"
2223

2324
"k8s.io/kops/upup/pkg/fi"
2425
)
2526

26-
func TestCloudTagsForInstanceGroup(t *testing.T) {
27+
func TestCloudTagsForInstanceGroupResource(t *testing.T) {
2728
c := newTestAzureModelContext()
2829
c.Cluster.Spec.CloudLabels = map[string]string{
2930
"cluster_label_key": "cluster_label_value",
@@ -41,7 +42,7 @@ func TestCloudTagsForInstanceGroup(t *testing.T) {
4142
"taint_key=taint_value",
4243
}
4344

44-
actual := c.CloudTagsForInstanceGroup(c.InstanceGroups[0])
45+
actual := c.CloudTagsForInstanceGroupResource(c.InstanceGroups[0])
4546
expected := map[string]*string{
4647
"cluster_label_key": fi.PtrTo("cluster_label_value"),
4748
"ig_label_key": fi.PtrTo("ig_label_value"),
@@ -50,9 +51,57 @@ func TestCloudTagsForInstanceGroup(t *testing.T) {
5051
"k8s.io_cluster_node-template_taint_taint_key": fi.PtrTo("taint_value"),
5152
"k8s.io_role_node": fi.PtrTo("1"),
5253
"kops.k8s.io_instancegroup": fi.PtrTo("nodes"),
54+
"KubernetesCluster": fi.PtrTo("testcluster.test.com"),
5355
}
5456

5557
if !reflect.DeepEqual(actual, expected) {
5658
t.Errorf("expected tags %+v, but got %+v", expected, actual)
5759
}
5860
}
61+
62+
func TestCloudTagsForClusterResource(t *testing.T) {
63+
c := newTestAzureModelContext()
64+
c.Cluster.ObjectMeta.Name = "my.k8s"
65+
c.Cluster.Spec.CloudLabels = map[string]string{
66+
"cluster_label_key": "cluster_label_value",
67+
"node_label/key": "node_label_value",
68+
}
69+
70+
actual := c.CloudTagsForClusterResource()
71+
expected := map[string]*string{
72+
"cluster_label_key": fi.PtrTo("cluster_label_value"),
73+
"node_label_key": fi.PtrTo("node_label_value"),
74+
"KubernetesCluster": fi.PtrTo("my.k8s"),
75+
}
76+
77+
if !reflect.DeepEqual(actual, expected) {
78+
t.Errorf("expected tags %+v, but got %+v", expected, actual)
79+
}
80+
}
81+
82+
func TestSanitizeUserAssignedManagedIdentityName(t *testing.T) {
83+
t.Run("preserves valid names", func(t *testing.T) {
84+
const name = "nodepool-1"
85+
if got := sanitizeUserAssignedManagedIdentityName(name); got != name {
86+
t.Fatalf("expected %q, but got %q", name, got)
87+
}
88+
})
89+
90+
t.Run("replaces invalid characters", func(t *testing.T) {
91+
got := sanitizeUserAssignedManagedIdentityName("my.cluster.example.com")
92+
if strings.Contains(got, ".") {
93+
t.Fatalf("expected dots to be replaced, got %q", got)
94+
}
95+
if got != "my-cluster-example-com" {
96+
t.Fatalf("expected %q, but got %q", "my-cluster-example-com", got)
97+
}
98+
})
99+
100+
t.Run("truncates long names", func(t *testing.T) {
101+
name := strings.Repeat("a", maxUserAssignedManagedIdentityNameLength+1)
102+
got := sanitizeUserAssignedManagedIdentityName(name)
103+
if len(got) > maxUserAssignedManagedIdentityNameLength {
104+
t.Fatalf("expected name length <= %d, got %d (%q)", maxUserAssignedManagedIdentityNameLength, len(got), got)
105+
}
106+
})
107+
}

pkg/model/azuremodel/names.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
Copyright 2026 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package azuremodel
18+
19+
import (
20+
"regexp"
21+
22+
"k8s.io/kops/pkg/truncate"
23+
)
24+
25+
// Azure user-assigned identity names must start with a letter or number, contain only alphanumerics, hyphens, and underscores,
26+
// and are limited to 128 characters. See:
27+
// https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/managed-identity-best-practice-recommendations
28+
const maxUserAssignedManagedIdentityNameLength = 128
29+
30+
var userAssignedManagedIdentityNameInvalidCharacters = regexp.MustCompile(`[^A-Za-z0-9_-]+`)
31+
32+
func sanitizeUserAssignedManagedIdentityName(name string) string {
33+
sanitized := userAssignedManagedIdentityNameInvalidCharacters.ReplaceAllString(name, "-")
34+
return truncate.TruncateString(sanitized, truncate.TruncateStringOptions{
35+
MaxLength: maxUserAssignedManagedIdentityNameLength,
36+
})
37+
}
38+
39+
func (c *AzureModelContext) NameForUserAssignedManagedIdentityControlPlane() string {
40+
return sanitizeUserAssignedManagedIdentityName(c.ClusterName())
41+
}

pkg/model/azuremodel/vmscaleset.go

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -54,43 +54,60 @@ func (b *VMScaleSetModelBuilder) Build(c *fi.CloudupModelBuilderContext) error {
5454
Tags: map[string]*string{},
5555
})
5656

57+
controlPlaneIdentity := b.addControlPlaneManagedIdentityTasks(c)
58+
5759
for _, ig := range b.InstanceGroups {
5860
name := b.AutoscalingGroupName(ig)
5961
vmss, err := b.buildVMScaleSetTask(c, name, ig)
6062
if err != nil {
6163
return err
6264
}
63-
c.AddTask(vmss)
6465

6566
if ig.IsControlPlane() || b.Cluster.UsesLegacyGossip() {
66-
// Create tasks for assigning built-in roles to VM Scale Sets.
67-
// See https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles
68-
resourceGroupID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s",
69-
b.Cluster.Spec.CloudProvider.Azure.SubscriptionID,
70-
b.Cluster.AzureResourceGroupName(),
71-
)
72-
c.AddTask(&azuretasks.RoleAssignment{
73-
Name: to.Ptr(fmt.Sprintf("%s-%s", *vmss.Name, "owner")),
74-
Lifecycle: b.Lifecycle,
75-
Scope: to.Ptr(resourceGroupID),
76-
VMScaleSet: vmss,
77-
// Owner
78-
RoleDefID: to.Ptr("8e3af657-a8ff-443c-a75c-2fe8c4bcb635"),
79-
})
80-
c.AddTask(&azuretasks.RoleAssignment{
81-
Name: to.Ptr(fmt.Sprintf("%s-%s", *vmss.Name, "blob")),
82-
Lifecycle: b.Lifecycle,
83-
Scope: to.Ptr(b.Cluster.Spec.CloudProvider.Azure.StorageAccountID),
84-
VMScaleSet: vmss,
85-
// Storage Blob Data Contributor
86-
RoleDefID: to.Ptr("ba92f5b4-2d11-453d-a403-e96b0029c9fe"),
87-
})
67+
vmss.ManagedIdentities = []*azuretasks.ManagedIdentity{controlPlaneIdentity}
8868
}
69+
70+
c.AddTask(vmss)
8971
}
9072

9173
return nil
9274
}
9375

76+
func (b *VMScaleSetModelBuilder) addControlPlaneManagedIdentityTasks(c *fi.CloudupModelBuilderContext) *azuretasks.ManagedIdentity {
77+
// Create tasks for assigning built-in roles to the shared control-plane managed identity.
78+
// See https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles
79+
controlPlaneIdentity := &azuretasks.ManagedIdentity{
80+
Name: fi.PtrTo(b.NameForUserAssignedManagedIdentityControlPlane()),
81+
Lifecycle: b.Lifecycle,
82+
ResourceGroup: b.LinkToResourceGroup(),
83+
Tags: b.CloudTagsForClusterResource(),
84+
}
85+
c.AddTask(controlPlaneIdentity)
86+
87+
resourceGroupID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s",
88+
b.Cluster.Spec.CloudProvider.Azure.SubscriptionID,
89+
b.Cluster.AzureResourceGroupName(),
90+
)
91+
// Owner role on the resource group.
92+
c.AddTask(&azuretasks.RoleAssignment{
93+
Name: to.Ptr(fmt.Sprintf("owner-%s", *controlPlaneIdentity.Name)),
94+
Lifecycle: b.Lifecycle,
95+
Scope: to.Ptr(resourceGroupID),
96+
ManagedIdentity: controlPlaneIdentity,
97+
RoleDefID: to.Ptr("8e3af657-a8ff-443c-a75c-2fe8c4bcb635"),
98+
})
99+
// Storage Blob Data Contributor role on the storage account.
100+
c.AddTask(&azuretasks.RoleAssignment{
101+
Name: to.Ptr(fmt.Sprintf("blob-%s", *controlPlaneIdentity.Name)),
102+
Lifecycle: b.Lifecycle,
103+
Scope: to.Ptr(b.Cluster.Spec.CloudProvider.Azure.StorageAccountID),
104+
ManagedIdentity: controlPlaneIdentity,
105+
RoleDefID: to.Ptr("ba92f5b4-2d11-453d-a403-e96b0029c9fe"),
106+
})
107+
108+
return controlPlaneIdentity
109+
}
110+
94111
func (b *VMScaleSetModelBuilder) buildVMScaleSetTask(
95112
c *fi.CloudupModelBuilderContext,
96113
name string,
@@ -176,7 +193,7 @@ func (b *VMScaleSetModelBuilder) buildVMScaleSetTask(
176193
}
177194
}
178195

179-
t.Tags = b.CloudTagsForInstanceGroup(ig)
196+
t.Tags = b.CloudTagsForInstanceGroupResource(ig)
180197

181198
return t, nil
182199
}

pkg/model/azuremodel/vmscaleset_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,16 @@ func TestVMScaleSetModelBuilder_Build(t *testing.T) {
7979
if err != nil {
8080
t.Errorf("unexpected error %s", err)
8181
}
82+
expectedName := b.NameForUserAssignedManagedIdentityControlPlane()
83+
for _, taskKey := range []string{
84+
"ManagedIdentity/" + expectedName,
85+
"RoleAssignment/owner-" + expectedName,
86+
"RoleAssignment/blob-" + expectedName,
87+
} {
88+
if _, ok := c.Tasks[taskKey]; !ok {
89+
t.Fatalf("expected task %q", taskKey)
90+
}
91+
}
8292
}
8393

8494
func TestGetCapacity(t *testing.T) {

pkg/resources/azure/azure.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ const (
4545
typeLoadBalancer = "LoadBalancer"
4646
typePublicIPAddress = "PublicIPAddress"
4747
typeNatGateway = "NatGateway"
48+
typeManagedIdentity = "ManagedIdentity"
4849
)
4950

5051
// ListResourcesAzure lists all resources for the cluster by quering Azure.
@@ -409,8 +410,33 @@ func (g *resourceGetter) listVMScaleSetsAndRoleAssignments(ctx context.Context)
409410
}
410411
rs = append(rs, r)
411412

412-
principalIDs[*vmss.Identity.PrincipalID] = vmss
413+
if vmss.Identity != nil {
414+
// Collect principal IDs from both system-assigned and user-assigned identities.
415+
if vmss.Identity.PrincipalID != nil {
416+
principalIDs[*vmss.Identity.PrincipalID] = vmss
417+
}
418+
for _, uai := range vmss.Identity.UserAssignedIdentities {
419+
if uai != nil && uai.PrincipalID != nil {
420+
principalIDs[*uai.PrincipalID] = vmss
421+
}
422+
}
423+
}
424+
}
425+
426+
// Collect VMSS IDs so that managed identities are not deleted before all VMSS are gone.
427+
var blocked []string
428+
for _, r := range rs {
429+
if r.Type == typeVMScaleSet {
430+
blocked = append(blocked, toKey(typeVMScaleSet, r.ID))
431+
}
432+
}
433+
434+
// Also list and delete managed identities owned by the cluster.
435+
miResources, err := g.listManagedIdentities(ctx, blocked)
436+
if err != nil {
437+
return nil, err
413438
}
439+
rs = append(rs, miResources...)
414440

415441
resourceGroupRAs, err := g.listRoleAssignments(ctx, principalIDs, g.resourceGroupID())
416442
if err != nil {
@@ -751,6 +777,32 @@ func (g *resourceGetter) deleteNatGateway(_ fi.Cloud, r *resources.Resource) err
751777
return g.cloud.NatGateway().Delete(context.TODO(), g.resourceGroupName(), r.Name)
752778
}
753779

780+
func (g *resourceGetter) listManagedIdentities(ctx context.Context, blocked []string) ([]*resources.Resource, error) {
781+
mis, err := g.cloud.ManagedIdentity().List(ctx, g.resourceGroupName())
782+
if err != nil {
783+
return nil, err
784+
}
785+
786+
var rs []*resources.Resource
787+
for _, mi := range mis {
788+
if !g.isOwnedByCluster(mi.Tags) {
789+
continue
790+
}
791+
rs = append(rs, &resources.Resource{
792+
Obj: mi,
793+
Type: typeManagedIdentity,
794+
ID: *mi.ID,
795+
Name: *mi.Name,
796+
Deleter: func(_ fi.Cloud, r *resources.Resource) error {
797+
return g.cloud.ManagedIdentity().Delete(context.TODO(), g.resourceGroupName(), r.Name)
798+
},
799+
Blocks: []string{toKey(typeResourceGroup, g.resourceGroupID())},
800+
Blocked: blocked,
801+
})
802+
}
803+
return rs, nil
804+
}
805+
754806
// isOwnedByCluster returns true if the resource is owned by the cluster.
755807
func (g *resourceGetter) isOwnedByCluster(tags map[string]*string) bool {
756808
for k, v := range tags {

0 commit comments

Comments
 (0)