Skip to content

Commit cc7998a

Browse files
authored
Merge pull request #345 from docker/fix/database-migration-race-condition
Database migration race condition
2 parents f7c2474 + 5f53d81 commit cc7998a

File tree

18 files changed

+1574
-12
lines changed

18 files changed

+1574
-12
lines changed

docs/database/README.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Docker MCP Gateway Database
2+
3+
## Overview
4+
5+
The Docker MCP Gateway uses an embedded SQLite database to store configuration data, profiles (working sets), server catalogs, and migration status. The database is located at `~/.docker/mcp/mcp-toolkit.db` and is automatically created and migrated when the CLI first runs.
6+
7+
This document covers the database architecture, migration system, concurrent access handling, and troubleshooting.
8+
9+
## Architecture
10+
11+
### Database Location
12+
13+
```
14+
~/.docker/mcp/
15+
├── mcp-toolkit.db # Main SQLite database
16+
├── .mcp-toolkit-migration.lock # Migration lock file (persists after creation)
17+
```
18+
19+
### Database Schema
20+
21+
The database schema is managed through versioned migrations in `pkg/db/migrations/`
22+
23+
### Key Components
24+
25+
**Database Package** (`pkg/db/`)
26+
- DAO interface for all database operations
27+
- Migration management with golang-migrate
28+
- Connection pooling and configuration
29+
- File locking for concurrent access safety
30+
31+
32+
## Migration System
33+
34+
### How Migrations Work
35+
36+
The Docker MCP Gateway uses [golang-migrate](https://github.com/golang-migrate/migrate) for schema versioning:
37+
38+
```
39+
Application Startup
40+
41+
pkg/db/New() called
42+
43+
Check for pending migrations
44+
45+
[Acquire File Lock] ← Prevents concurrent migrations
46+
47+
Run migrations sequentially
48+
49+
[Release File Lock]
50+
51+
Database ready for use
52+
```
53+
54+
### Migration Lock File
55+
56+
To prevent race conditions when multiple processes start simultaneously, migrations use an OS-level file lock:
57+
58+
- **Lock File**: `~/.docker/mcp/.mcp-toolkit-migration.lock`
59+
- **Library**: `github.com/gofrs/flock` (cross-platform)
60+
- **Persistence**: The lock file persists on disk after migrations complete (this is intentional and safe)
61+
62+
### Migration Operations
63+
64+
Migrations use both SQLite locks AND file locks:
65+
- File lock prevents multiple processes from attempting migrations
66+
- SQLite locks protect individual migration transactions
67+
- This two-level locking ensures safety during initialization
68+
69+
### Why File Locking Is Necessary
70+
71+
Migrations with [golang-migrate](https://github.com/golang-migrate/migrate) are not run within a single transaction. There is the possibility
72+
for a race condition if two process concurrently attempt to run the migrations.
73+
74+
**The Problem**: When multiple processes start simultaneously:
75+
1. All check the database state (clean)
76+
2. All try to run migrations at the same time
77+
3. Race condition occurs → "Dirty database version" error
78+
79+
**The Solution**: OS-level file locking ensures only one process runs migrations at a time. Other processes wait, then see migrations already complete.
80+
81+
82+
## Testing
83+
84+
### Concurrent Migration Test
85+
86+
Test that file locking prevents migration race conditions:
87+
88+
```bash
89+
go run test_migration_race.go
90+
```
91+
92+
This test launches 100 concurrent processes that all try to initialize the database simultaneously. With file locking, all 100 should succeed (100% success rate). Without file locking, most would fail with "Dirty database version" errors.
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"os/exec"
8+
"strings"
9+
"sync"
10+
"time"
11+
)
12+
13+
const (
14+
dbPath = "/Users/bobby/.docker/mcp/mcp-toolkit.db"
15+
iterations = 100
16+
)
17+
18+
type result struct {
19+
iteration int
20+
success bool
21+
output string
22+
err error
23+
duration time.Duration
24+
}
25+
26+
func main() {
27+
fmt.Printf("Starting migration race test with %d concurrent iterations...\n", iterations)
28+
29+
// Delete the database once at the beginning
30+
fmt.Printf("Deleting database at %s...\n", dbPath)
31+
if err := os.Remove(dbPath); err != nil && !os.IsNotExist(err) {
32+
fmt.Printf("Warning: failed to remove db: %v\n", err)
33+
} else {
34+
fmt.Println("Database deleted successfully")
35+
}
36+
37+
fmt.Println("\nLaunching all processes simultaneously...")
38+
39+
var wg sync.WaitGroup
40+
results := make(chan result, iterations)
41+
42+
startTime := time.Now()
43+
44+
// Launch all iterations concurrently - they will all race to initialize the database
45+
for i := range iterations {
46+
wg.Add(1)
47+
go func(iterNum int) {
48+
defer wg.Done()
49+
50+
iterStart := time.Now()
51+
52+
// Run docker mcp profile list - all will try to initialize/migrate at once
53+
ctx := context.Background()
54+
cmd := exec.CommandContext(ctx, "docker", "mcp", "profile", "list")
55+
output, err := cmd.CombinedOutput()
56+
57+
results <- result{
58+
iteration: iterNum,
59+
success: err == nil,
60+
output: string(output),
61+
err: err,
62+
duration: time.Since(iterStart),
63+
}
64+
}(i)
65+
}
66+
67+
// Wait for all goroutines to complete
68+
go func() {
69+
wg.Wait()
70+
close(results)
71+
}()
72+
73+
// Collect and analyze results
74+
var (
75+
successCount int
76+
failureCount int
77+
migrationErrors int
78+
dirtyDBErrors int
79+
)
80+
81+
fmt.Println("Results:")
82+
fmt.Println("--------")
83+
84+
for res := range results {
85+
if res.success {
86+
successCount++
87+
fmt.Printf("[%3d] ✓ Success (took %v)\n", res.iteration, res.duration)
88+
} else {
89+
failureCount++
90+
fmt.Printf("[%3d] ✗ FAILED (took %v)\n", res.iteration, res.duration)
91+
fmt.Printf(" Error: %v\n", res.err)
92+
93+
if res.output != "" {
94+
fmt.Printf(" Output: %s\n", res.output)
95+
}
96+
97+
// Check for specific error patterns
98+
outputStr := res.output
99+
if res.err != nil {
100+
outputStr += res.err.Error()
101+
}
102+
103+
if containsStr(outputStr, "failed to run migrations") {
104+
migrationErrors++
105+
}
106+
if containsStr(outputStr, "Dirty database version") {
107+
dirtyDBErrors++
108+
}
109+
}
110+
}
111+
112+
totalDuration := time.Since(startTime)
113+
114+
fmt.Println("\n" + strings.Repeat("=", 60))
115+
fmt.Println("Summary:")
116+
fmt.Println(strings.Repeat("=", 60))
117+
fmt.Printf("Total iterations: %d\n", iterations)
118+
fmt.Printf("Successful: %d (%.1f%%)\n", successCount, float64(successCount)/float64(iterations)*100)
119+
fmt.Printf("Failed: %d (%.1f%%)\n", failureCount, float64(failureCount)/float64(iterations)*100)
120+
fmt.Printf("Migration errors: %d\n", migrationErrors)
121+
fmt.Printf("Dirty DB errors: %d\n", dirtyDBErrors)
122+
fmt.Printf("Total time: %v\n", totalDuration)
123+
fmt.Printf("Avg time per iter: %v\n", totalDuration/time.Duration(iterations))
124+
125+
if dirtyDBErrors > 0 {
126+
fmt.Println("\n🎯 SUCCESS! Reproduced the 'Dirty database version' error!")
127+
} else if migrationErrors > 0 {
128+
fmt.Println("\n⚠️ Found migration errors but not the specific 'Dirty database' error")
129+
} else if failureCount > 0 {
130+
fmt.Println("\n⚠️ Found failures but not migration-related")
131+
} else {
132+
fmt.Println("\n✓ All iterations succeeded (could not reproduce the error)")
133+
}
134+
}
135+
136+
func containsStr(s, substr string) bool {
137+
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && contains(s, substr))
138+
}
139+
140+
func contains(s, substr string) bool {
141+
for i := 0; i <= len(s)-len(substr); i++ {
142+
if s[i:i+len(substr)] == substr {
143+
return true
144+
}
145+
}
146+
return false
147+
}

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
github.com/dop251/goja v0.0.0-20251008123653-cf18d89f3cf6
1616
github.com/fsnotify/fsnotify v1.9.0
1717
github.com/go-playground/validator/v10 v10.28.0
18+
github.com/gofrs/flock v0.13.0
1819
github.com/golang-migrate/migrate/v4 v4.19.0
1920
github.com/google/go-containerregistry v0.20.6
2021
github.com/google/jsonschema-go v0.3.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,8 @@ github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7Lk
366366
github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2 h1:ZpnhV/YsD2/4cESfV5+Hoeu/iUR3ruzNvZ+yQfO03a0=
367367
github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk=
368368
github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
369+
github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw=
370+
github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0=
369371
github.com/gogo/protobuf v1.0.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
370372
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
371373
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=

pkg/db/db.go

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package db
22

33
import (
4+
"context"
45
"database/sql"
56
"embed"
67
"errors"
@@ -9,13 +10,13 @@ import (
910
"path/filepath"
1011
"time"
1112

13+
"github.com/gofrs/flock"
1214
"github.com/golang-migrate/migrate/v4"
1315
msqlite "github.com/golang-migrate/migrate/v4/database/sqlite"
1416
"github.com/golang-migrate/migrate/v4/source/iofs"
1517
"github.com/jmoiron/sqlx"
1618

1719
"github.com/docker/mcp-gateway/pkg/log"
18-
"github.com/docker/mcp-gateway/pkg/retry"
1920
"github.com/docker/mcp-gateway/pkg/user"
2021

2122
// This enables to sqlite driver
@@ -94,18 +95,36 @@ func New(opts ...Option) (DAO, error) {
9495
return nil, err
9596
}
9697

97-
// Migrations are transactional for individual migrations, not for the entire migration process
98-
// A race condition can happen if two instances of the CLI try to run migrations at the same time
99-
// This doesn't harm the state of the database, but the second process will fail with an error
100-
// We work around this by retrying the migration up to 5 times.
101-
err = retry.Retry(5, 300*time.Millisecond, func() error {
102-
err := mig.Up()
103-
if err != nil && !errors.Is(err, migrate.ErrNoChange) {
104-
return err
105-
}
106-
return nil
107-
})
98+
// Use file locking to prevent concurrent migrations across processes
99+
// This ensures only one process can run migrations at a time, preventing
100+
// "Dirty database version" errors when multiple processes start simultaneously.
101+
// See docs/database/README.md
102+
//
103+
// Note: The lock file persists on disk after Unlock() - this is intentional.
104+
// flock.Unlock() only releases the lock and closes the file descriptor
105+
lockFile := filepath.Join(filepath.Dir(o.dbFile), ".mcp-toolkit-migration.lock")
106+
fileLock := flock.New(lockFile)
107+
108+
// Try to acquire the lock with a 5 second timeout
109+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
110+
defer cancel()
111+
112+
locked, err := fileLock.TryLockContext(ctx, 100*time.Millisecond)
108113
if err != nil {
114+
return nil, fmt.Errorf("failed to acquire migration lock: %w", err)
115+
}
116+
if !locked {
117+
return nil, fmt.Errorf("timeout waiting for migration lock")
118+
}
119+
defer func() {
120+
if err := fileLock.Unlock(); err != nil {
121+
log.Logf("failed to unlock migration lock: %v", err)
122+
}
123+
}()
124+
125+
// Now safely run migrations with the lock held
126+
err = mig.Up()
127+
if err != nil && !errors.Is(err, migrate.ErrNoChange) {
109128
return nil, fmt.Errorf("failed to run migrations: %w", err)
110129
}
111130

vendor/github.com/gofrs/flock/.gitignore

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)