Skip to content

Commit 289ce46

Browse files
author
Shlomi Noach
authored
Merge branch 'master' into fix-infinite-cutover-loop
2 parents 8a681e1 + 8605463 commit 289ce46

11 files changed

Lines changed: 115 additions & 28 deletions

File tree

doc/cheatsheet.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,12 @@ gh-ost --allow-master-master --assume-master-host=a.specific.master.com
146146

147147
Topologies using _tungsten replicator_ are peculiar in that the participating servers are not actually aware they are replicating. The _tungsten replicator_ looks just like another app issuing queries on those hosts. `gh-ost` is unable to identify that a server participates in a _tungsten_ topology.
148148

149-
If you choose to migrate directly on master (see above), there's nothing special you need to do. If you choose to migrate via replica, then you must supply the identity of the master, and indicate this is a tungsten setup, as follows:
149+
If you choose to migrate directly on master (see above), there's nothing special you need to do.
150+
151+
If you choose to migrate via replica, then you need to make sure Tungsten is configured with the log-slave-updates parameter (note this is different from MySQL's own log-slave-updates parameter); otherwise changes will not be in the replica's binlog, causing data to be corrupted after table swap. You must also supply the identity of the master, and indicate this is a tungsten setup, as follows:
150152

151153
```
152154
gh-ost --tungsten --assume-master-host=the.topology.master.com
153155
```
156+
157+
Also note that `--switch-to-rbr` does not work for a Tungsten setup as the replication process is external, so you need to make sure `binlog_format` is set to ROW before Tungsten Replicator connects to the server and starts applying events from the master.

doc/interactive-commands.md

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ Both interfaces may serve at the same time. Both respond to simple text command,
2626
- The `critical-load` format must be: `some_status=<numeric-threshold>[,some_status=<numeric-threshold>...]`
2727
- For example: `Threads_running=1000,threads_connected=5000`, and you would then write/echo `critical-load=Threads_running=1000,threads_connected=5000` to the socket.
2828
- `nice-ratio=<ratio>`: change _nice_ ratio: 0 for aggressive (not nice, not sleeping), positive number `n` (fractions allowed):
29-
- For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping.
30-
- Examples: assume a single rows chunk copy takes `100ms` to complete.
31-
- `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following.
29+
- For any `1ms` spent copying rows, spend `n*1ms` units of time sleeping.
30+
- Examples: assume a single rows chunk copy takes `100ms` to complete.
31+
- `nice-ratio=0.5` will cause `gh-ost` to sleep for `50ms` immediately following.
3232
- `nice-ratio=1` will cause `gh-ost` to sleep for `100ms`, effectively doubling runtime
3333
- value of `2` will effectively triple the runtime; etc.
3434
- `throttle-query`: change throttle query
@@ -38,6 +38,10 @@ Both interfaces may serve at the same time. Both respond to simple text command,
3838
- `unpostpone`: at a time where `gh-ost` is postponing the [cut-over](cut-over.md) phase, instruct `gh-ost` to stop postponing and proceed immediately to cut-over.
3939
- `panic`: immediately panic and abort operation
4040

41+
### Querying for data
42+
43+
For commands that accept an argument as value, pass `?` (question mark) to _get_ the current value rather than _set_ a new one.
44+
4145
### Examples
4246

4347
While migration is running:
@@ -63,6 +67,11 @@ $ echo "chunk-size=250" | nc -U /tmp/gh-ost.test.sample_data_0.sock
6367
# Serving on TCP port: 10001
6468
```
6569

70+
```shell
71+
$ echo "chunk-size=?" | nc -U /tmp/gh-ost.test.sample_data_0.sock
72+
250
73+
```
74+
6675
```shell
6776
$ echo throttle | nc -U /tmp/gh-ost.test.sample_data_0.sock
6877

go/base/context.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,9 @@ type MigrationContext struct {
135135
OriginalBinlogFormat string
136136
OriginalBinlogRowImage string
137137
InspectorConnectionConfig *mysql.ConnectionConfig
138+
InspectorMySQLVersion string
138139
ApplierConnectionConfig *mysql.ConnectionConfig
140+
ApplierMySQLVersion string
139141
StartTime time.Time
140142
RowCopyStartTime time.Time
141143
RowCopyEndTime time.Time
@@ -559,7 +561,11 @@ func (this *MigrationContext) GetControlReplicasLagResult() mysql.ReplicationLag
559561
func (this *MigrationContext) SetControlReplicasLagResult(lagResult *mysql.ReplicationLagResult) {
560562
this.throttleMutex.Lock()
561563
defer this.throttleMutex.Unlock()
562-
this.controlReplicasLagResult = *lagResult
564+
if lagResult == nil {
565+
this.controlReplicasLagResult = *mysql.NewNoReplicationLagResult()
566+
} else {
567+
this.controlReplicasLagResult = *lagResult
568+
}
563569
}
564570

565571
func (this *MigrationContext) GetThrottleControlReplicaKeys() *mysql.InstanceKeyMap {

go/cmd/gh-ost/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,5 +242,5 @@ func main() {
242242
migrator.ExecOnFailureHook()
243243
log.Fatale(err)
244244
}
245-
log.Info("Done")
245+
fmt.Fprintf(os.Stdout, "# Done\n")
246246
}

go/logic/applier.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,15 @@ func (this *Applier) InitDBConnections() (err error) {
7070
if err := this.readTableColumns(); err != nil {
7171
return err
7272
}
73+
log.Infof("Applier initiated on %+v, version %+v", this.connectionConfig.ImpliedKey, this.migrationContext.ApplierMySQLVersion)
7374
return nil
7475
}
7576

7677
// validateConnection issues a simple can-connect to MySQL
7778
func (this *Applier) validateConnection(db *gosql.DB) error {
78-
query := `select @@global.port`
79+
query := `select @@global.port, @@global.version`
7980
var port int
80-
if err := db.QueryRow(query).Scan(&port); err != nil {
81+
if err := db.QueryRow(query).Scan(&port, &this.migrationContext.ApplierMySQLVersion); err != nil {
8182
return err
8283
}
8384
if port != this.connectionConfig.Key.Port {

go/logic/inspect.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (this *Inspector) InitDBConnections() (err error) {
6060
if err := this.applyBinlogFormat(); err != nil {
6161
return err
6262
}
63+
log.Infof("Inspector initiated on %+v, version %+v", this.connectionConfig.ImpliedKey, this.migrationContext.InspectorMySQLVersion)
6364
return nil
6465
}
6566

@@ -168,9 +169,9 @@ func (this *Inspector) inspectOriginalAndGhostTables() (err error) {
168169

169170
// validateConnection issues a simple can-connect to MySQL
170171
func (this *Inspector) validateConnection() error {
171-
query := `select @@global.port`
172+
query := `select @@global.port, @@global.version`
172173
var port int
173-
if err := this.db.QueryRow(query).Scan(&port); err != nil {
174+
if err := this.db.QueryRow(query).Scan(&port, &this.migrationContext.InspectorMySQLVersion); err != nil {
174175
return err
175176
}
176177
if port != this.connectionConfig.Key.Port {

go/logic/migrator.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,12 @@ func (this *Migrator) printMigrationStatusHint(writers ...io.Writer) {
791791
throttleQuery,
792792
))
793793
}
794+
if throttleControlReplicaKeys := this.migrationContext.GetThrottleControlReplicaKeys(); throttleControlReplicaKeys.Len() > 0 {
795+
fmt.Fprintln(w, fmt.Sprintf("# throttle-control-replicas count: %+v",
796+
throttleControlReplicaKeys.Len(),
797+
))
798+
}
799+
794800
if this.migrationContext.PostponeCutOverFlagFile != "" {
795801
setIndicator := ""
796802
if base.FileExists(this.migrationContext.PostponeCutOverFlagFile) {
@@ -970,7 +976,9 @@ func (this *Migrator) initiateThrottler() error {
970976

971977
go this.throttler.initiateThrottlerCollection(this.firstThrottlingCollected)
972978
log.Infof("Waiting for first throttle metrics to be collected")
973-
<-this.firstThrottlingCollected
979+
<-this.firstThrottlingCollected // replication lag
980+
<-this.firstThrottlingCollected // other metrics
981+
log.Infof("First throttle metrics collected")
974982
go this.throttler.initiateThrottlerChecks()
975983

976984
return nil

go/logic/server.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func (this *Server) applyServerCommand(command string, writer *bufio.Writer) (pr
126126
if len(tokens) > 1 {
127127
arg = strings.TrimSpace(tokens[1])
128128
}
129-
129+
argIsQuestion := (arg == "?")
130130
throttleHint := "# Note: you may only throttle for as long as your binary logs are not purged\n"
131131

132132
if err := this.hooksExecutor.onInteractiveCommand(command); err != nil {
@@ -152,6 +152,7 @@ no-throttle # End forced throttling (other throttling m
152152
unpostpone # Bail out a cut-over postpone; proceed to cut-over
153153
panic # panic and quit without cleanup
154154
help # This message
155+
- use '?' (question mark) as argument to get info rather than set. e.g. "max-load=?" will just print out current max-load.
155156
`)
156157
}
157158
case "sup":
@@ -160,6 +161,10 @@ help # This message
160161
return ForcePrintStatusAndHintRule, nil
161162
case "chunk-size":
162163
{
164+
if argIsQuestion {
165+
fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.ChunkSize))
166+
return NoPrintStatusRule, nil
167+
}
163168
if chunkSize, err := strconv.Atoi(arg); err != nil {
164169
return NoPrintStatusRule, err
165170
} else {
@@ -169,6 +174,10 @@ help # This message
169174
}
170175
case "max-lag-millis":
171176
{
177+
if argIsQuestion {
178+
fmt.Fprintf(writer, "%+v\n", atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold))
179+
return NoPrintStatusRule, nil
180+
}
172181
if maxLagMillis, err := strconv.Atoi(arg); err != nil {
173182
return NoPrintStatusRule, err
174183
} else {
@@ -182,6 +191,10 @@ help # This message
182191
}
183192
case "nice-ratio":
184193
{
194+
if argIsQuestion {
195+
fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetNiceRatio())
196+
return NoPrintStatusRule, nil
197+
}
185198
if niceRatio, err := strconv.ParseFloat(arg, 64); err != nil {
186199
return NoPrintStatusRule, err
187200
} else {
@@ -191,26 +204,44 @@ help # This message
191204
}
192205
case "max-load":
193206
{
207+
if argIsQuestion {
208+
maxLoad := this.migrationContext.GetMaxLoad()
209+
fmt.Fprintf(writer, "%s\n", maxLoad.String())
210+
return NoPrintStatusRule, nil
211+
}
194212
if err := this.migrationContext.ReadMaxLoad(arg); err != nil {
195213
return NoPrintStatusRule, err
196214
}
197215
return ForcePrintStatusAndHintRule, nil
198216
}
199217
case "critical-load":
200218
{
219+
if argIsQuestion {
220+
criticalLoad := this.migrationContext.GetCriticalLoad()
221+
fmt.Fprintf(writer, "%s\n", criticalLoad.String())
222+
return NoPrintStatusRule, nil
223+
}
201224
if err := this.migrationContext.ReadCriticalLoad(arg); err != nil {
202225
return NoPrintStatusRule, err
203226
}
204227
return ForcePrintStatusAndHintRule, nil
205228
}
206229
case "throttle-query":
207230
{
231+
if argIsQuestion {
232+
fmt.Fprintf(writer, "%+v\n", this.migrationContext.GetThrottleQuery())
233+
return NoPrintStatusRule, nil
234+
}
208235
this.migrationContext.SetThrottleQuery(arg)
209236
fmt.Fprintf(writer, throttleHint)
210237
return ForcePrintStatusAndHintRule, nil
211238
}
212239
case "throttle-control-replicas":
213240
{
241+
if argIsQuestion {
242+
fmt.Fprintf(writer, "%s\n", this.migrationContext.GetThrottleControlReplicaKeys().ToCommaDelimitedList())
243+
return NoPrintStatusRule, nil
244+
}
214245
if err := this.migrationContext.ReadThrottleControlReplicaKeys(arg); err != nil {
215246
return NoPrintStatusRule, err
216247
}

go/logic/throttler.go

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -84,21 +84,38 @@ func (this *Throttler) parseChangelogHeartbeat(heartbeatValue string) (err error
8484
}
8585
}
8686

87-
// collectHeartbeat reads the latest changelog heartbeat value
88-
func (this *Throttler) collectHeartbeat() {
89-
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
90-
for range ticker {
91-
go func() error {
92-
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
93-
return nil
87+
// collectReplicationLag reads the latest changelog heartbeat value
88+
func (this *Throttler) collectReplicationLag(firstThrottlingCollected chan<- bool) {
89+
collectFunc := func() error {
90+
if atomic.LoadInt64(&this.migrationContext.CleanupImminentFlag) > 0 {
91+
return nil
92+
}
93+
94+
if this.migrationContext.TestOnReplica || this.migrationContext.MigrateOnReplica {
95+
// when running on replica, the heartbeat injection is also done on the replica.
96+
// This means we will always get a good heartbeat value.
97+
// When running on replica, we should instead check the `SHOW SLAVE STATUS` output.
98+
if lag, err := mysql.GetReplicationLag(this.inspector.connectionConfig); err != nil {
99+
return log.Errore(err)
100+
} else {
101+
atomic.StoreInt64(&this.migrationContext.CurrentLag, int64(lag))
94102
}
103+
} else {
95104
if heartbeatValue, err := this.inspector.readChangelogState("heartbeat"); err != nil {
96105
return log.Errore(err)
97106
} else {
98107
this.parseChangelogHeartbeat(heartbeatValue)
99108
}
100-
return nil
101-
}()
109+
}
110+
return nil
111+
}
112+
113+
collectFunc()
114+
firstThrottlingCollected <- true
115+
116+
ticker := time.Tick(time.Duration(this.migrationContext.HeartbeatIntervalMilliseconds) * time.Millisecond)
117+
for range ticker {
118+
go collectFunc()
102119
}
103120
}
104121

@@ -114,6 +131,7 @@ func (this *Throttler) collectControlReplicasLag() {
114131

115132
readReplicaLag := func(connectionConfig *mysql.ConnectionConfig) (lag time.Duration, err error) {
116133
dbUri := connectionConfig.GetDBUri("information_schema")
134+
117135
var heartbeatValue string
118136
if db, _, err := sqlutils.GetDB(dbUri); err != nil {
119137
return lag, err
@@ -158,9 +176,7 @@ func (this *Throttler) collectControlReplicasLag() {
158176
// No need to read lag
159177
return
160178
}
161-
if result := readControlReplicasLag(); result != nil {
162-
this.migrationContext.SetControlReplicasLagResult(result)
163-
}
179+
this.migrationContext.SetControlReplicasLagResult(readControlReplicasLag())
164180
}
165181
aggressiveTicker := time.Tick(100 * time.Millisecond)
166182
relaxedFactor := 10
@@ -272,13 +288,14 @@ func (this *Throttler) collectGeneralThrottleMetrics() error {
272288
// that may affect throttling. There are several components, all running independently,
273289
// that collect such metrics.
274290
func (this *Throttler) initiateThrottlerCollection(firstThrottlingCollected chan<- bool) {
275-
go this.collectHeartbeat()
291+
go this.collectReplicationLag(firstThrottlingCollected)
276292
go this.collectControlReplicasLag()
277293

278294
go func() {
279-
throttlerMetricsTick := time.Tick(1 * time.Second)
280295
this.collectGeneralThrottleMetrics()
281296
firstThrottlingCollected <- true
297+
298+
throttlerMetricsTick := time.Tick(1 * time.Second)
282299
for range throttlerMetricsTick {
283300
this.collectGeneralThrottleMetrics()
284301
}

go/mysql/utils.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ type ReplicationLagResult struct {
2222
Err error
2323
}
2424

25+
func NewNoReplicationLagResult() *ReplicationLagResult {
26+
return &ReplicationLagResult{Lag: 0, Err: nil}
27+
}
28+
29+
func (this *ReplicationLagResult) HasLag() bool {
30+
return this.Lag > 0
31+
}
32+
2533
// GetReplicationLag returns replication lag for a given connection config; either by explicit query
2634
// or via SHOW SLAVE STATUS
2735
func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.Duration, err error) {
@@ -32,9 +40,11 @@ func GetReplicationLag(connectionConfig *ConnectionConfig) (replicationLag time.
3240
}
3341

3442
err = sqlutils.QueryRowsMap(db, `show slave status`, func(m sqlutils.RowMap) error {
43+
slaveIORunning := m.GetString("Slave_IO_Running")
44+
slaveSQLRunning := m.GetString("Slave_SQL_Running")
3545
secondsBehindMaster := m.GetNullInt64("Seconds_Behind_Master")
3646
if !secondsBehindMaster.Valid {
37-
return fmt.Errorf("replication not running")
47+
return fmt.Errorf("replication not running; Slave_IO_Running=%+v, Slave_SQL_Running=%+v", slaveIORunning, slaveSQLRunning)
3848
}
3949
replicationLag = time.Duration(secondsBehindMaster.Int64) * time.Second
4050
return nil

0 commit comments

Comments
 (0)