Skip to content

Commit d2cb33b

Browse files
committed
gce: don't request migration on instance types that don't support it
GPU instance types in particular do not support live migration, and requesting it causes the instance creation to fail.
1 parent 5adae13 commit d2cb33b

File tree

2 files changed

+55
-20
lines changed

2 files changed

+55
-20
lines changed

upup/pkg/fi/cloudup/gcetasks/instance.go

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -190,21 +190,13 @@ func scopeToShortForm(s string) string {
190190
func (e *Instance) mapToGCE(project string, ipAddressResolver func(*Address) (*string, error)) (*compute.Instance, error) {
191191
zone := *e.Zone
192192

193-
var scheduling *compute.Scheduling
194-
if fi.ValueOf(e.Preemptible) {
195-
scheduling = &compute.Scheduling{
196-
OnHostMaintenance: "TERMINATE",
197-
Preemptible: true,
198-
}
199-
} else {
200-
scheduling = &compute.Scheduling{
201-
AutomaticRestart: fi.PtrTo(true),
202-
// TODO: Migrate or terminate?
203-
OnHostMaintenance: "MIGRATE",
204-
Preemptible: false,
205-
}
193+
machineTypeInfo, err := guessMachineTypeInfo(fi.ValueOf(e.MachineType))
194+
if err != nil {
195+
return nil, fmt.Errorf("getting machine type info: %w", err)
206196
}
207197

198+
scheduling := buildScheduling(machineTypeInfo, e.Preemptible, nil /* e.GCPProvisioningModel */, nil /* e.GuestAccelerators*/)
199+
208200
var disks []*compute.AttachedDisk
209201
disks = append(disks, &compute.AttachedDisk{
210202
InitializeParams: &compute.AttachedDiskInitializeParams{

upup/pkg/fi/cloudup/gcetasks/instancetemplate.go

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -257,32 +257,75 @@ func (*InstanceTemplate) CheckChanges(a, e, changes *InstanceTemplate) error {
257257
return nil
258258
}
259259

260-
func (e *InstanceTemplate) mapToGCE(project string, region string) (*compute.InstanceTemplate, error) {
261-
// TODO: This is similar to Instance...
260+
// MachineTypeInfo holds the properties of a GCE machine type that influence
// how instances of that type are scheduled.
type MachineTypeInfo struct {
	// SupportsMigration is true when instances of the machine type can be
	// live-migrated during host maintenance. When false, the scheduling
	// options must use TERMINATE instead of MIGRATE.
	SupportsMigration bool
}
263+
264+
func buildScheduling(machineTypeInfo *MachineTypeInfo, preemptible *bool, gcpProvisioningModel *string, guestAccelerators []AcceleratorConfig) *compute.Scheduling {
262265
var scheduling *compute.Scheduling
263266

264-
if fi.ValueOf(e.Preemptible) {
267+
if fi.ValueOf(preemptible) {
265268
scheduling = &compute.Scheduling{
266269
AutomaticRestart: fi.PtrTo(false),
267270
OnHostMaintenance: "TERMINATE",
268-
ProvisioningModel: fi.ValueOf(e.GCPProvisioningModel),
271+
ProvisioningModel: fi.ValueOf(gcpProvisioningModel),
269272
Preemptible: true,
270273
}
271274
} else {
275+
// We default to allowing migration, as it gives higher uptime.
276+
// However, if we figure out that the instance does not support migration, we will set this to TERMINATE (so we can create the instance at all).
272277
scheduling = &compute.Scheduling{
273-
AutomaticRestart: fi.PtrTo(true),
274-
// TODO: Migrate or terminate?
278+
AutomaticRestart: fi.PtrTo(true),
275279
OnHostMaintenance: "MIGRATE",
276280
ProvisioningModel: "STANDARD",
277281
Preemptible: false,
278282
}
279283
}
280284

281-
if len(e.GuestAccelerators) > 0 {
285+
if len(guestAccelerators) > 0 {
282286
// Instances with accelerators cannot be migrated.
283287
scheduling.OnHostMaintenance = "TERMINATE"
284288
}
285289

290+
if machineTypeInfo != nil {
291+
if !machineTypeInfo.SupportsMigration {
292+
scheduling.OnHostMaintenance = "TERMINATE"
293+
}
294+
}
295+
return scheduling
296+
}
297+
298+
// guessMachineTypeInfo returns information about the machine type, such as whether it supports live migration.
299+
// We use this to determine the correct scheduling options for non-preemptible VMs.
300+
// If the machine type is not found, we return placeholder information, as we want to be tolerant of missing machine types, and just default to the safest scheduling options.
301+
func guessMachineTypeInfo(machineType string) (*MachineTypeInfo, error) {
302+
machineTypeInfo := &MachineTypeInfo{
303+
SupportsMigration: true,
304+
}
305+
if machineType == "" {
306+
return machineTypeInfo, nil
307+
}
308+
309+
family := strings.Split(machineType, "-")[0]
310+
311+
switch family {
312+
case "a4x", "a4", "a3", "a2", "g2", "g4":
313+
// VMs with GPUs attached do not support live migration.
314+
// https://docs.cloud.google.com/compute/docs/instances/live-migration-process#limitations
315+
machineTypeInfo.SupportsMigration = false
316+
}
317+
318+
return machineTypeInfo, nil
319+
}
320+
321+
func (e *InstanceTemplate) mapToGCE(project string, region string) (*compute.InstanceTemplate, error) {
322+
machineTypeInfo, err := guessMachineTypeInfo(fi.ValueOf(e.MachineType))
323+
if err != nil {
324+
return nil, fmt.Errorf("getting machine type info: %w", err)
325+
}
326+
327+
scheduling := buildScheduling(machineTypeInfo, e.Preemptible, e.GCPProvisioningModel, e.GuestAccelerators)
328+
286329
var disks []*compute.AttachedDisk
287330
disks = append(disks, &compute.AttachedDisk{
288331
Kind: "compute#attachedDisk",

0 commit comments

Comments
 (0)