Skip to content

Commit 3150726

Browse files
author
Shlomi Noach
authored
Merge pull request #370 from github/throttle-race-condition
Migration only starting after first replication lag metric collected
2 parents 86928dd + c807236 commit 3150726

3 files changed

Lines changed: 37 additions & 27 deletions

File tree

go/logic/migrator.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,9 @@ func (this *Migrator) initiateThrottler() error {
940940

941941
go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected)
942942
log.Infof("Waiting for first throttle metrics to be collected")
943-
<-this.firstThrottlingCollected
943+
<-this.firstThrottlingCollected // replication lag
944+
<-this.firstThrottlingCollected // other metrics
945+
log.Infof("First throttle metrics collected")
944946
go this.throttler.initiateThrottlerChecks()
945947

946948
return nil

go/logic/throttler.go

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -85,32 +85,37 @@ func (this *Throttler) parseChangelogHeartbeat(heartbeatValue string) (err error
8585
}
8686

8787
// collectReplicationLag reads the latest changelog heartbeat value
88-
func (this *Throttler) collectReplicationLag() {
89-
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
90-
for range ticker {
91-
go func() error {
92-
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
93-
return nil
94-
}
88+
func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- bool) {
89+
collectFunc := func() error {
90+
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
91+
return nil
92+
}
9593

96-
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
97-
// when running on replica, the heartbeat injection is also done on the replica.
98-
// This means we will always get a good heartbeat value.
99-
// When runnign on replica, we should instead check the `SHOW SLAVE STATUS` output.
100-
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
101-
return log.Errore(err)
102-
} else {
103-
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
104-
}
94+
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
95+
// when running on replica, the heartbeat injection is also done on the replica.
96+
// This means we will always get a good heartbeat value.
97+
// When runnign on replica, we should instead check the `SHOW SLAVE STATUS` output.
98+
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
99+
return log.Errore(err)
105100
} else {
106-
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
107-
return log.Errore(err)
108-
} else {
109-
this.parseChangelogHeartbeat(heartbeatValue)
110-
}
101+
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
111102
}
112-
return nil
113-
}()
103+
} else {
104+
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
105+
return log.Errore(err)
106+
} else {
107+
this.parseChangelogHeartbeat(heartbeatValue)
108+
}
109+
}
110+
return nil
111+
}
112+
113+
collectFunc()
114+
firstThrottlingCollected <- true
115+
116+
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
117+
for range ticker {
118+
go collectFunc()
114119
}
115120
}
116121

@@ -285,13 +290,14 @@ func (this *Throttler) collectGeneralThrottleMetrics() error {
285290
// that may affect throttling. There are several components, all running independently,
286291
// that collect such metrics.
287292
func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) {
288-
go this.collectReplicationLag()
293+
go this.collectReplicationLag(firstThrottlingCollected)
289294
go this.collectControlReplicasLag()
290295

291296
go func() {
292-
throttlerMetricsTick := time.Tick(1 * time.Second)
293297
this.collectGeneralThrottleMetrics()
294298
firstThrottlingCollected <- true
299+
300+
throttlerMetricsTick := time.Tick(1 * time.Second)
295301
for range throttlerMetricsTick {
296302
this.collectGeneralThrottleMetrics()
297303
}

go/mysql/utils.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.
3232
}
3333

3434
err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error {
35+
slaveIORunning := m.GetString("Slave_IO_Running")
36+
slaveSQLRunning := m.GetString("Slave_SQL_Running")
3537
secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master")
3638
if !secondsBehindMaster.Valid {
37-
return fmt.Errorf("replication not running")
39+
return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning)
3840
}
3941
replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second
4042
return nil

0 commit comments

Comments
 (0)