diff --git a/Cargo.lock b/Cargo.lock index ee4de02216..3bdf7c4ae1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -844,12 +844,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.1", ] [[package]] @@ -1334,7 +1334,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.52.0", ] [[package]] @@ -1585,9 +1585,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.177" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libm" @@ -1628,6 +1628,12 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "litemap" version = "0.8.1" @@ -2463,6 +2469,18 @@ dependencies = [ "thiserror 2.0.17", ] +[[package]] +name = "reflink-copy" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13362233b147e57674c37b802d216b7c5e3dcccbed8967c84f0d8d223868ae27" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "rustix 1.1.4", + "windows", +] + [[package]] name = "regex" version = "1.12.2" @@ -2755,10 +2773,23 @@ dependencies = [ "bitflags 2.9.4", "errno", "libc", - 
"linux-raw-sys", + "linux-raw-sys 0.4.12", "windows-sys 0.52.0", ] +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags 2.9.4", + "errno", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.1", +] + [[package]] name = "rustls" version = "0.23.10" @@ -2931,6 +2962,7 @@ dependencies = [ "openssl", "predicates", "rand 0.8.5", + "reflink-copy", "regex", "reqsign 0.18.0", "reqwest", @@ -3419,7 +3451,7 @@ checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if 1.0.0", "fastrand", - "rustix", + "rustix 0.38.34", "windows-sys 0.52.0", ] @@ -3438,7 +3470,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix", + "rustix 0.38.34", "windows-sys 0.48.0", ] @@ -4179,7 +4211,7 @@ checksum = "b4ee928febd44d98f2f459a4a79bd4d928591333a494a10a868418ac1b39cf1f" dependencies = [ "either", "home", - "rustix", + "rustix 0.38.34", "winsafe", ] @@ -4214,6 +4246,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.62.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e6c4a1f363c8210c6f77ba24f645c61c6fb941eccf013da691f7e09515b8ac" +dependencies = [ + "windows-collections", + "windows-core 0.62.1", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "123e712f464a8a60ce1a13f4c446d2d43ab06464cb5842ff68f5c71b6fb7852e" +dependencies = [ + "windows-core 0.62.1", +] + [[package]] name = "windows-core" version = "0.52.0" @@ 
-4223,6 +4276,52 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.62.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6844ee5416b285084d3d3fffd743b925a6c9385455f64f6d4fa3031c4c2749a9" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link 0.2.0", + "windows-result 0.4.0", + "windows-strings 0.5.0", +] + +[[package]] +name = "windows-future" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f3db6b24b120200d649cd4811b4947188ed3a8d2626f7075146c5d178a9a4a" +dependencies = [ + "windows-core 0.62.1", + "windows-link 0.2.0", + "windows-threading", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.1.3" @@ -4235,6 +4334,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" +[[package]] +name = "windows-numerics" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ce3498fe0aba81e62e477408383196b4b0363db5e0c27646f932676283b43d8" +dependencies = [ + "windows-core 0.62.1", + "windows-link 0.2.0", +] + [[package]] name = "windows-registry" version = "0.5.3" @@ -4242,8 +4351,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" dependencies = [ 
"windows-link 0.1.3", - "windows-result", - "windows-strings", + "windows-result 0.3.4", + "windows-strings 0.4.2", ] [[package]] @@ -4255,6 +4364,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-result" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows-strings" version = "0.4.2" @@ -4264,6 +4382,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-strings" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4331,6 +4458,15 @@ dependencies = [ "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-threading" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab47f085ad6932defa48855254c758cdd0e2f2d48e62a34118a268d8f345e118" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -4455,8 +4591,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "914566e6413e7fa959cc394fb30e563ba80f3541fbd40816d4c05a0fc3f2a0f1" dependencies = [ "libc", - "linux-raw-sys", - "rustix", + "linux-raw-sys 0.4.12", + "rustix 0.38.34", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index eb9a759985..8691e87599 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ object = "0.37" opendal = { version = "0.55.0", optional = true, default-features = false } openssl = { version = "0.10.75", optional = true } rand = "0.8.4" +reflink-copy = "0.1.24" regex = "1.10.3" reqsign = { version = "0.18.0", optional = true } reqwest = { version = "0.12", features = [ diff --git a/docs/Configuration.md 
b/docs/Configuration.md index 5a65173ef1..4abddb6c09 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -58,6 +58,11 @@ key_prefix = "" [cache.disk] dir = "/tmp/.cache/sccache" size = 7516192768 # 7 GiBytes +# Store cache entries uncompressed and restore them with filesystem reflinks +# (copy-on-write). Near-instant, near-zero-disk cache hits when the cache dir and +# the build dir share one CoW filesystem (Btrfs/XFS/APFS/ReFS); falls back to +# plain copies elsewhere. Default: false. See docs/FileClone.md. +file_clone = false # See the local docs on more explanations about this mode [cache.disk.preprocessor_cache_mode] @@ -224,6 +229,7 @@ export SCCACHE_MULTILEVEL_WRITE_ERROR_POLICY="all" * `SCCACHE_CACHE_SIZE` maximum size of the local on disk cache i.e. `2G` - default is 10G * `SCCACHE_DIRECT` enable/disable preprocessor caching (see [the local doc](Local.md)) * `SCCACHE_LOCAL_RW_MODE` the mode that the cache will operate in (`READ_ONLY` or `READ_WRITE`) +* `SCCACHE_FILE_CLONE` set to `true`/`on`/`1` to enable uncompressed, reflink (copy-on-write) cache storage (see [the file_clone doc](FileClone.md)). Default is `false`. #### s3 compatible diff --git a/docs/FileClone.md b/docs/FileClone.md new file mode 100644 index 0000000000..732e85540c --- /dev/null +++ b/docs/FileClone.md @@ -0,0 +1,237 @@ +# File clone (reflink / copy-on-write) disk cache + +`file_clone` is an **opt-in** mode for sccache's **local disk cache** that stores +cache entries **uncompressed** and restores them using filesystem **reflinks** +(copy-on-write, "CoW"): + +* `FICLONE` on Linux (Btrfs, XFS with reflink support, bcachefs, ...), +* `clonefile` on macOS (APFS), +* block cloning on Windows (ReFS / Dev Drives). + +On a copy-on-write filesystem a reflink makes the new file share the source +file's on-disk blocks until one of them is modified. 
This means: + +* **Cache writes are near-free** – when a compile misses, sccache reflinks the + freshly produced object files straight into the cache instead of reading and + zstd-compressing them. The cache entry shares blocks with your build tree. +* **Cache hits are near-instant** – on a hit, sccache reflinks the cached files + back out to your build tree. No decompression, no data copy. +* **The cache uses almost no extra disk** – because the cache entry, the original + build output, and every future restoration all share the same physical extents + until something modifies them. + +This implements the idea from issues +[#1053](https://github.com/mozilla/sccache/issues/1053) and +[#1174](https://github.com/mozilla/sccache/issues/1174) and PR +[#2640](https://github.com/mozilla/sccache/pull/2640) (credit to @quake), reusing +its good ideas (opt-in config, marker-based directory entries, mixed-format +reads) while fixing the bugs found in review. + +The default remains the compressed cache; nothing changes unless you opt in. + +## Enabling it + +Config file (`[cache.disk]`): + +```toml +[cache.disk] +file_clone = true +``` + +Or via environment variable: + +```bash +export SCCACHE_FILE_CLONE=true +``` + +Restart the sccache server after changing this (`sccache --stop-server`). + +## ⚠️ Same copy-on-write filesystem requirement + +To get the disk-saving and speed benefits, **both** of these must live on the +**same copy-on-write filesystem**: + +1. the sccache cache directory (`SCCACHE_DIR` / `[cache.disk] dir`), and +2. the directory where the compiler writes its output (your build tree). + +Reflinks cannot span filesystems. If the cache and the build tree are on +different filesystems, or on a filesystem without reflink support +(ext4, tmpfs, overlayfs, NTFS, ...), sccache transparently falls back to a plain +byte copy. 
In that case: + +* `file_clone` still avoids compression/decompression work, **but** +* restored artifacts are full copies (normal disk usage), and +* the cache stores entries **uncompressed**, so it will be **larger** on disk + than the default compressed cache. + +So on a non-CoW filesystem the default (compressed) cache is usually the better +choice. `file_clone` is for when your cache and build trees share a CoW volume. + +## Verifying that reflinks are happening + +`sccache --show-stats` reports two counters: + +``` +Objects restored by reflink 123 +Objects restored by copy 0 +``` + +* **Objects restored by reflink** – objects materialised by sharing blocks + (the fast/cheap CoW path). A non-zero value confirms reflinks are working. +* **Objects restored by copy** – objects that fell back to a byte copy because + reflinking wasn't possible (different filesystem, no CoW support, ...). + +The same numbers are available as `objects_reflinked` and +`objects_copied_fallback` in `sccache --show-stats --stats-format=json`. + +At server start, if `file_clone` is enabled but the cache directory's filesystem +does not support reflinks, sccache logs a warning so you know you'll only get the +copy-fallback behaviour. + +To *see* the block sharing on a CoW filesystem, note that plain `du` does **not** +reflect reflink/extent sharing (a reflinked file still reports its full +`st_blocks`, and the sharing is between `target/` and the *cache* directory, which +`du target/` can't see). On btrfs, use [`compsize`](https://github.com/kilobyte/compsize), +which reports the *actual* on-disk usage with shared extents counted once: + +```bash +# Measure the cache and the restored build tree TOGETHER. "Disk Usage" counts each +# physical extent once, so when the restore reflinks the cache it stays near one copy +# while "Referenced" (the logical size) is ~two copies. 
+compsize "$SCCACHE_DIR" target/ + +# The cleanest reflink proof is the *marginal* disk the restore adds: the on-disk +# usage of (cache + restore) minus that of the cache alone is ~0 when the restore +# reflinks, and ~the full restored size when it falls back to copying. This also +# cancels out any btrfs transparent compression, which affects both terms equally. +compsize "$SCCACHE_DIR" # cache only +compsize "$SCCACHE_DIR" target/ # cache + restored artifacts +``` + +The bundled `scripts/bench-file-clone.sh` automates this `compsize` comparison +across one or more projects and prints a markdown table. + +## Why reflinks are safe (unlike hardlinks) + +A natural alternative would be to hardlink cache entries into the build tree, but +that is unsafe: a hardlink and the cache entry are the *same* inode, so if a later +build step modifies the file **in place** (for example `strip`, `install -s`, or +an incremental linker), it corrupts the cached copy too. + +Reflinks do not have this problem. A reflink is copy-on-write: the cache entry and +the restored file start out sharing blocks, but the moment either one is written +to, the filesystem transparently forks the modified blocks. The cache copy is +never affected. This is why `file_clone` is the safe subset of the hardlink idea +from #1053 / #1174 and needs no read-only juggling. + +## On-disk format + +* **Compressed entries** (the default) are unchanged: a single file at + `{cache}/{c0}/{c1}/{key}`. Enabling or disabling `file_clone` does **not** + invalidate an existing compressed cache. +* **Uncompressed (`file_clone`) entries** are a directory at the same key path, + `{cache}/{c0}/{c1}/{key}/`, containing an `objects/` subdirectory with one plain + file per cached object (`objects/obj`, `objects/d`, ...), optional + `stdout`/`stderr` files, and a marker file `.sccache_dir_entry`. Objects are + namespaced under `objects/` so an object key can never collide with the reserved + `stdout`/`stderr`/marker names. 
The marker file also stores a small manifest of + each object's original output mode. + +Cache object files (and `stdout`/`stderr`/marker) are written `0600`, and the +entry and cache-root directories are kept user-private, matching the compressed +path (whose blobs are `0600` temp files). The *restored build output* still gets +its correct original mode, taken from the manifest — not from the private cache +copy. See "Security" below. + +Both formats coexist, so the two modes can be switched back and forth without +wiping the cache; lookups check for a directory entry first and fall back to the +compressed file. The first compressed write to a key that previously held a +directory entry (i.e. after turning `file_clone` *off*) transparently removes the +stale directory before writing the compressed file. Entries are written into a +temporary directory and atomically renamed into place, so concurrent builds never +observe a half-written entry. + +The preprocessor cache (a separate cache nested under the main cache directory) is +never stored as directory entries and is left completely untouched by +`file_clone`. + +## Security + +`file_clone` cache objects are stored uncompressed, so on Unix they are written +`0600` and the directories holding them are created `0700` regardless of the +process umask: the cache root, each entry directory, its `objects/` subdirectory, +and the immediate `{c0}/{c1}` parent. Because POSIX unlink/rename is governed by +the *parent directory's* write permission, the `0700` directories — not just the +`0600` files — are what prevent another user on a shared host from reading, +unlinking or replacing cached objects (cache poisoning → arbitrary code). + +When `file_clone` is enabled on a cache directory that is currently +group/other-accessible, sccache tightens the root to `0700` and logs a warning, +since this can lock out a genuinely shared cache. 
Keep the cache directory +user-private; do **not** point `SCCACHE_DIR` at a world/group-writable location +when using `file_clone` (the compressed cache already stores its blobs `0600`). +Cache-entry sources are opened without following symlinks on Linux as +defence-in-depth. + +## Caveats / limitations + +* **Multi-level caches**: `file_clone` only affects a *single-level local disk* + cache. A disk level used inside a `[cache.multilevel]` chain always stores + **compressed** entries (writes go through `put_raw`); the flag is not honoured + there. If a stray uncompressed entry is ever encountered at a multilevel disk + level, its `get_raw` returns `None`, so it is counted as a hit but never used as + a backfill source. Reflink-based storage at a multilevel L1 is out of scope. +* **Remote backends** (S3, Redis, GCS, ...) are unaffected; reflink is an + inherently local, same-filesystem concept. +* On a non-CoW filesystem the uncompressed cache is larger than the compressed + one; prefer the default there. +* `file_clone` does not change cache keys, so it is safe to toggle on and off + (mixed compressed/uncompressed entries coexist). + +## Benchmarking + +See `scripts/bench-file-clone.sh` for a self-contained tool that compares cold +builds, warm (compressed) rebuilds and warm (`file_clone`) rebuilds, and reports +cache sizes, restored-artifact disk usage, and reflink/copy counts. Run it with: + +```bash +# Offline C project target (no network): +scripts/bench-file-clone.sh + +# Also benchmark real cargo projects (needs network): +BENCH_REPOS="ripgrep=https://github.com/BurntSushi/ripgrep \ + fd=https://github.com/sharkdp/fd \ + bat=https://github.com/sharkdp/bat" \ + scripts/bench-file-clone.sh +``` + +It is a manual performance tool and is intentionally **not** wired into CI. + +### Example results + +Measured on a Btrfs (copy-on-write) filesystem with a debug `sccache`. Times in +seconds; sizes in KiB. 
`cache+restore on disk` is the [`compsize`](https://github.com/kilobyte/compsize) +disk usage of the file_clone cache and the restored artifacts together (shared extents +counted once); `restore marginal disk` is that minus the cache's own on-disk usage — the +NEW disk a restore consumes, which is ~0 when the artifacts reflink the cache. `reflink/copy` +is `objects_reflinked`/`objects_copied_fallback`. + +| target | cold | warm (compressed) | warm (file_clone) | compressed cache | file_clone cache | restored (logical) | cache+restore on disk | restore marginal disk | reflink/copy | +|---------|------:|------------------:|------------------:|-----------------:|-----------------:|-------------------:|----------------------:|----------------------:|:-----------:| +| local-c | 2.33 | 0.23 | 0.22 | 2400 | 8160 | 6899 | 2069 | 0 | 120/0 | +| ripgrep | 6.77 | 4.76 | 4.33 | 29404 | 111320 | 349005 | 152064 | 118329 | 75/0 | +| fd | 38.51 | 15.54 | 14.58 | 91044 | 356280 | 350839 | 164360 | 59556 | 434/0 | +| bat | 12.35 | 10.63 | 7.81 | 144016 | 514384 | 925313 | 317167 | 155235 | 758/0 | + +For comparison, the *compressed* cache's `restore marginal disk` (the same compsize +measurement, run against the compressed cache) is much higher — local-c 1920, ripgrep +152056, fd 118952, bat 313963 KiB — because a compressed-cache restore writes fresh, +unshared blocks. The gap is the disk the reflink sharing saves on every restore. + +Notes: every object was reflinked (`copy = 0`) on this CoW filesystem; `file_clone` +warm rebuilds were as fast as or faster than the compressed cache (no decompression). +The compressed cache is smaller on disk (the trade-off). The offline `local-c` target — +a pure-compilation workload with no link/bookkeeping step — restores with `restore +marginal disk = 0`, i.e. the restored artifacts share **all** their blocks with the +cache. 
For the cargo targets the small marginal remainder is the freshly linked binary +and cargo's fingerprint/incremental files, which sccache does not cache. diff --git a/docs/Local.md b/docs/Local.md index f1b4543ca7..c9f0a1d95b 100644 --- a/docs/Local.md +++ b/docs/Local.md @@ -6,6 +6,12 @@ The default cache size is 10 gigabytes. To change this, set `SCCACHE_CACHE_SIZE` The local storage only supports a single sccache server at a time. Multiple concurrent servers will race and cause spurious build failures. +## File clone (reflink / copy-on-write) mode + +By default the local cache stores each entry as a compressed (ZIP-of-zstd) file. Setting `file_clone = true` in `[cache.disk]` (or `SCCACHE_FILE_CLONE=true`) instead stores entries **uncompressed** and restores them with filesystem **reflinks** (copy-on-write). On a CoW filesystem (Btrfs, XFS with reflink, APFS, ReFS) this makes cache writes and cache hits near-instant and lets the cached artifacts share disk blocks with your build tree, so the cache adds almost no extra disk usage. + +For the full disk/speed benefit, the cache directory **and** the build directory must live on the **same** copy-on-write filesystem. On other filesystems `file_clone` still works (no decompression on read) but falls back to plain copies, which use the normal amount of disk. See [the file_clone doc](FileClone.md) for full details, including how to verify reflinks happened via `sccache --show-stats`. + ## Preprocessor cache mode This is inspired by [ccache's direct mode](https://ccache.dev/manual/3.7.9.html#_the_direct_mode) and works roughly the same. diff --git a/scripts/bench-file-clone.sh b/scripts/bench-file-clone.sh new file mode 100755 index 0000000000..c11bf39006 --- /dev/null +++ b/scripts/bench-file-clone.sh @@ -0,0 +1,384 @@ +#!/usr/bin/env bash +# ============================================================================= +# bench-file-clone.sh - Benchmark sccache's `file_clone` (reflink) disk cache. 
+# +# This is a MANUAL performance tool (not wired into CI). It compares, per target +# repository/project, three scenarios: +# +# * cold - a clean build with sccache disabled (no cache), +# * warm - a rebuild served entirely from the DEFAULT (compressed) disk cache, +# * clone - a rebuild served entirely from the `file_clone` (uncompressed +# reflink) disk cache. +# +# and reports, per target: +# +# * the wall-clock time of each scenario, +# * the on-disk size of the compressed vs. file_clone cache, +# * the *actual* disk space used by the cache and the restored build artifacts +# together, measured with `compsize` (the authoritative btrfs tool): when they +# share blocks via reflink, the combined on-disk usage stays near one copy while +# the logical "referenced" size is ~two copies, and the difference is the disk saved, and +# * how many restored objects were reflinked vs. copied (from `--show-stats`). +# +# The methodology mirrors PR https://github.com/mozilla/sccache/pull/2640, which +# benchmarked ripgrep/fd/bat. +# +# IMPORTANT: the block-sharing / near-zero-disk benefit only materialises when the +# cache directory AND the build directory live on the SAME copy-on-write filesystem +# (Btrfs, XFS w/ reflink, APFS, ReFS). On other filesystems file_clone still works +# (no decompression on read) but falls back to plain copies, so disk usage will not +# shrink. The script prints which case it observed. +# +# Usage: +# scripts/bench-file-clone.sh [extra cargo repos ...] +# +# Environment: +# SCCACHE Path to the sccache binary. Default: build ./target/release/sccache. +# WORKDIR Scratch directory. Default: a fresh mktemp dir (removed on exit). +# C_FILES Number of generated C files for the offline target. Default: 120. +# BENCH_REPOS Space-separated "name=git-url" entries to additionally benchmark +# with `cargo build` (requires network + cargo). 
# Example:
#   BENCH_REPOS="ripgrep=https://github.com/BurntSushi/ripgrep \
#                fd=https://github.com/sharkdp/fd" \
#     scripts/bench-file-clone.sh
#
# The offline C target ("local-c") always runs and needs no network, so the tool
# can be verified end-to-end in an isolated environment.
#
# Platform: this script is Linux/GNU-oriented (it uses `du --apparent-size`, `df -Pk`,
# `stat -f -c`, `nproc`). On btrfs it uses `compsize` for the authoritative on-disk /
# disk-savings measurement; install it from https://github.com/kilobyte/compsize (or
# your distro's `compsize` package). It is a manual perf tool, not part of CI.
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# ----- locate / build sccache --------------------------------------------------
# Honour a caller-supplied SCCACHE; otherwise build the release binary from this checkout.
if [[ -z "${SCCACHE:-}" ]]; then
  echo ">> Building sccache (release)..."
  (cd "${REPO_ROOT}" && cargo build --release --bin sccache >/dev/null)
  SCCACHE="${REPO_ROOT}/target/release/sccache"
fi
# The timed builds run in child shells (`bash -c '... SCCACHE="$SCCACHE"'`), and a child
# process only sees EXPORTED variables. Without this export the self-built path would hand
# the warm builds an empty SCCACHE, i.e. they would compile without sccache and the
# "no cache hits" check would abort the benchmark.
export SCCACHE
echo ">> Using sccache: ${SCCACHE}"
"${SCCACHE}" --version

# ----- scratch dir -------------------------------------------------------------
# Only a directory this script created itself is removed on exit; a caller-supplied
# WORKDIR is left in place.
CLEANUP_WORKDIR=0
if [[ -z "${WORKDIR:-}" ]]; then
  WORKDIR="$(mktemp -d)"
  CLEANUP_WORKDIR=1
fi
mkdir -p "${WORKDIR}"
echo ">> Work directory: ${WORKDIR}"

if ! command -v compsize >/dev/null 2>&1; then
  echo ">> WARNING: 'compsize' not found; the disk-savings columns will show n/a." >&2
  echo ">> Install it for the authoritative btrfs measurement: https://github.com/kilobyte/compsize" >&2
fi

# EXIT handler: stop the sccache server (best effort) and delete the scratch directory
# if, and only if, this script created it.
cleanup() {
  "${SCCACHE}" --stop-server >/dev/null 2>&1 || true
  if [[ "${CLEANUP_WORKDIR}" == "1" ]]; then
    rm -rf "${WORKDIR}"
  fi
}
trap cleanup EXIT

C_FILES="${C_FILES:-120}"

# Results, one markdown row per target. Initialised to an empty array so that expanding
# it is safe under `set -u` even before the first row is appended.
declare -a RESULTS=()

# Print wall-clock seconds (float) of running "$@".
# Aborts (non-zero exit) if the timed command
# fails, so a broken build can't masquerade as a fast "warm" time.
#
# Arguments: the command to run and its arguments ("$@").
# Output:    elapsed seconds with two decimals on stdout; the command's own stdout and
#            stderr are discarded.
# Returns:   0 on success, otherwise the command's exit status (after an ERROR line on stderr).
time_cmd() {
  local start end status
  start="$(date +%s.%N)"
  # Capture the status through if/else so `set -e` does not kill the script before the
  # failure can be reported.
  if "$@" >/dev/null 2>&1; then
    status=0
  else
    status=$?
  fi
  end="$(date +%s.%N)"
  if [ "${status}" -ne 0 ]; then
    echo "ERROR: timed command failed (exit ${status}): $*" >&2
    return "${status}"
  fi
  awk -v s="${start}" -v e="${end}" 'BEGIN { printf "%.2f", e - s }'
}

# Directory size in KiB (actual blocks used).
dir_kib() {
  du -sk "$1" 2>/dev/null | awk '{print $1}'
}

# `du` apparent size in KiB (logical size, ignores block sharing).
dir_apparent_kib() {
  du -sk --apparent-size "$1" 2>/dev/null | awk '{print $1}'
}

# Available space in KiB on the filesystem containing $1.
fs_avail_kib() {
  df -Pk "$1" 2>/dev/null | awk 'NR==2 {print $4}'
}

# Authoritative on-disk measurement via `compsize` (btrfs). Given one or more paths, prints
# "DISK_KIB REFERENCED_KIB": DISK = physical blocks actually allocated, with shared/reflinked
# extents counted ONCE; REFERENCED = logical bytes referenced, counting every reference.
# Measuring the cache and the restored artifacts together, DISK far below REFERENCED proves
# they share blocks via reflink, and REFERENCED-DISK is the disk saved. Empty if compsize is
# unavailable or a path is not on btrfs.
#
# Always returns 0. The script runs under `set -euo pipefail`, so without the trailing
# `|| true` a failing compsize (e.g. a path that is not on btrfs) would make the pipeline,
# and therefore every `var="$(compsize_disk_ref ...)"` assignment, fail and abort the whole
# benchmark instead of yielding the documented empty result.
compsize_disk_ref() {
  command -v compsize >/dev/null 2>&1 || return 0
  compsize -b "$@" 2>/dev/null \
    | awk '/^TOTAL/ { printf "%d %d", int($3 / 1024), int($5 / 1024) }' \
    || true
}

# Extract a top-level numeric stat from `sccache --show-stats --stats-format=json`.
#
#   $1 = cache directory (passed to sccache as SCCACHE_DIR)
#   $2 = key to read from the JSON document's "stats" object
#
# Prints the value, or 0 when the key is missing or the stats cannot be read or parsed.
stat_json() {
  local dir="$1" key="$2"
  SCCACHE_DIR="${dir}" "${SCCACHE}" --show-stats --stats-format=json 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin).get('stats',{}).get('${key}',0))" 2>/dev/null \
    || echo 0
}

# Total cache hits across all languages (sum of cache_hits.counts).
cache_hits_total() {
  # $1 = cache directory (passed to sccache as SCCACHE_DIR).
  # Prints the sum of stats.cache_hits.counts, or 0 if the stats cannot be read or parsed.
  local dir="$1"
  SCCACHE_DIR="${dir}" "${SCCACHE}" --show-stats --stats-format=json 2>/dev/null \
    | python3 -c "import sys,json; print(sum(json.load(sys.stdin).get('stats',{}).get('cache_hits',{}).get('counts',{}).values()))" 2>/dev/null \
    || echo 0
}

# (Re)start the sccache server against a given cache directory.
#
#   $1   = cache directory (exported to the server as SCCACHE_DIR)
#   $2.. = optional extra NAME=VALUE environment assignments for the server
#
# Any running server is stopped first so the new settings actually take effect.
start_server() {
  local dir="$1"; shift
  "${SCCACHE}" --stop-server >/dev/null 2>&1 || true
  # Forward "$@" directly rather than via a copied array: expanding an EMPTY array is an
  # "unbound variable" error under `set -u` on bash < 4.4, whereas "$@" is always safe.
  env SCCACHE_DIR="${dir}" "$@" "${SCCACHE}" --start-server >/dev/null 2>&1
}

# Stop the sccache server; it is not an error if none is running.
stop_server() {
  "${SCCACHE}" --stop-server >/dev/null 2>&1 || true
}

# -----------------------------------------------------------------------------
# Generate a self-contained C project (no network) with $C_FILES translation
# units plus a Makefile that compiles them through "$SCCACHE gcc".
#
#   $1 = project directory (removed and recreated)
#
# Reads the globals C_FILES (number of units) and C_FILE_STMTS (statements per
# unit, default 600).
# -----------------------------------------------------------------------------
gen_c_project() {
  local dir="$1"
  rm -rf "${dir}"
  mkdir -p "${dir}/src"
  # Each unit gets a long, data-dependent arithmetic body so the optimizer has
  # real work to do (otherwise sccache's per-call overhead dwarfs the compile and
  # the warm build can look slower than the cold one).
  local stmts="${C_FILE_STMTS:-600}"
  local i j
  for ((i = 0; i < C_FILES; i++)); do
    {
      # Unquoted heredoc on purpose: ${i} must expand so every unit exports a distinct
      # symbol. <stdint.h> provides uint64_t, <stddef.h> provides size_t.
      cat <<EOF
#include <stdint.h>
#include <stddef.h>
static uint64_t table_${i}[256];
uint64_t compute_${i}(uint64_t x) {
    uint64_t a = ${i}u, b = 0x9e3779b97f4a7c15ull ^ x, c = x;
    for (size_t k = 0; k < 256; ++k) {
        table_${i}[k] = (x ^ (k * 2654435761u)) + a;
        a = (a << 7) ^ (a >> 3) ^ table_${i}[k];
    }
EOF
      for ((j = 0; j < stmts; j++)); do
        echo "    a = a * 6364136223846793005ull + b; b = (b ^ (a >> 17)) + c; c = (c << 5) - a;"
      done
      echo "    return a ^ b ^ c;"
      echo "}"
    } >"${dir}/src/mod_${i}.c"
  done
  {
    echo "CC ?= gcc"
    echo "SCCACHE ?= sccache"
    echo "CFLAGS ?= -O2 -g"
    printf 'OBJS ='
    for ((i = 0; i < C_FILES; i++)); do printf ' build/mod_%d.o' "${i}"; done
    echo
    echo
    echo "all: \$(OBJS)"
    echo
    echo "build/%.o: src/%.c"
    # make requires recipe lines to start with a TAB. Emit it with printf '\t' instead of
    # a literal tab inside a string, which editors and formatters silently turn into spaces.
    printf '\t@mkdir -p build\n'
    printf '\t%s\n' '$(SCCACHE) $(CC) $(CFLAGS) -c $< -o $@'
    echo
    echo "clean:"
    printf '\trm -rf build\n'
  } >"${dir}/Makefile"
}

# -----------------------------------------------------------------------------
# Benchmark one target.
#   $1 = display name
#   $2 = project directory
#   $3 = cold build command (sccache disabled)
#   $4 = warm build command (uses $SCCACHE for the compiler)
#   $5 = clean command
#   $6 = build output subdirectory (for measuring restored-artifact disk usage)
#
# Runs three scenarios (cold, warm from the compressed cache, warm from the file_clone
# cache), prints a per-scenario summary and appends one markdown row to RESULTS.
# Exits the script if a populate build fails or a "warm" run produced no cache hits.
# -----------------------------------------------------------------------------
bench_target() {
  local name="$1" proj="$2" cold_cmd="$3" warm_cmd="$4" clean_cmd="$5" out_subdir="$6"
  echo
  echo "============================================================"
  echo ">> Target: ${name}"
  echo "============================================================"

  local comp_cache="${WORKDIR}/cache-compressed-${name}"
  local clone_cache="${WORKDIR}/cache-fileclone-${name}"
  rm -rf "${comp_cache}" "${clone_cache}"

  # ---- cold build (no sccache) ----
  (cd "${proj}" && eval "${clean_cmd}") >/dev/null 2>&1 || true
  stop_server
  local cold
  cold="$(cd "${proj}" && time_cmd bash -c "${cold_cmd}")"
  echo " cold (no cache): ${cold}s"

  local out_dir="${proj}/${out_subdir}"

  # ---- warm build, compressed cache ----
  start_server "${comp_cache}"
  (cd "${proj}" && eval "${clean_cmd}") >/dev/null 2>&1 || true
  # Populate (miss). The build output is discarded, so report a failure explicitly instead
  # of letting `set -e` end the script without any message.
  if ! (cd "${proj}" && eval "${warm_cmd}") >/dev/null 2>&1; then
    echo "ERROR: populate build (compressed cache) for ${name} failed" >&2
    stop_server
    exit 1
  fi
  (cd "${proj}" && eval "${clean_cmd}") >/dev/null 2>&1 || true
  sync
  local avail_before_comp
  avail_before_comp="$(fs_avail_kib "${proj}")"
  local warm
  # The timed command runs in a child `bash -c`, which only sees exported variables, so
  # SCCACHE is passed in explicitly; otherwise "$SCCACHE" in the warm command could expand
  # to nothing and the "warm" run would silently compile without sccache.
  warm="$(cd "${proj}" && time_cmd env SCCACHE="${SCCACHE}" bash -c "${warm_cmd}")" # hit
  sync
  local comp_restore_delta=$(( avail_before_comp - $(fs_avail_kib "${proj}") ))
  local comp_size
  comp_size="$(dir_kib "${comp_cache}")"
  # compsize on btrfs: the compressed cache and the restored artifacts share no blocks, so
  # restoring adds ~the full restored size of new disk (comp_marginal = union disk - cache disk).
  local comp_cs comp_cache_cs comp_disk="n/a" comp_marginal="n/a"
  comp_cs="$(compsize_disk_ref "${comp_cache}" "${out_dir}")"
  comp_cache_cs="$(compsize_disk_ref "${comp_cache}")"
  if [[ -n "${comp_cs}" && -n "${comp_cache_cs}" ]]; then
    comp_disk="${comp_cs%% *}"
    comp_marginal=$(( comp_disk - ${comp_cache_cs%% *} ))
  fi
  # The warm run must be a genuine cache hit, otherwise the time is meaningless.
  if [ "$(cache_hits_total "${comp_cache}")" -eq 0 ]; then
    echo "ERROR: compressed warm build for ${name} produced no cache hits" >&2
    stop_server
    exit 1
  fi
  echo " warm (compressed): ${warm}s cache=${comp_size} KiB df-delta=${comp_restore_delta} KiB compsize: cache+restore on-disk=${comp_disk} KiB restore-marginal=${comp_marginal} KiB"
  stop_server

  # ---- warm build, file_clone cache ----
  start_server "${clone_cache}" SCCACHE_FILE_CLONE=true
  (cd "${proj}" && eval "${clean_cmd}") >/dev/null 2>&1 || true
  # Populate (miss); see the compressed scenario for why the failure is reported explicitly.
  if ! (cd "${proj}" && eval "${warm_cmd}") >/dev/null 2>&1; then
    echo "ERROR: populate build (file_clone cache) for ${name} failed" >&2
    stop_server
    exit 1
  fi
  (cd "${proj}" && eval "${clean_cmd}") >/dev/null 2>&1 || true
  sync
  local avail_before_clone
  avail_before_clone="$(fs_avail_kib "${proj}")"
  local clone
  clone="$(cd "${proj}" && time_cmd env SCCACHE="${SCCACHE}" bash -c "${warm_cmd}")" # hit (reflink/copy)
  sync
  local clone_restore_delta=$(( avail_before_clone - $(fs_avail_kib "${proj}") ))
  local clone_size
  clone_size="$(dir_kib "${clone_cache}")"

  # Logical (apparent) size of the restored artifacts, for reference.
  local restored_apparent
  restored_apparent="$(dir_apparent_kib "${out_dir}")"

  # Authoritative disk-savings measurement on btrfs via compsize. Measuring the file_clone
  # cache and the restored artifacts TOGETHER counts shared (reflinked) extents once, so the
  # marginal disk the restore adds = compsize_disk(cache+restore) - compsize_disk(cache) is
  # ~0 when (and only when) the restore reflinks the cache. This isolates block sharing from
  # any btrfs transparent compression, which affects both terms equally.
  local clone_cs clone_cache_cs clone_disk="n/a" clone_ref="n/a" clone_cache_disk="n/a" restore_marginal="n/a"
  clone_cs="$(compsize_disk_ref "${clone_cache}" "${out_dir}")"
  clone_cache_cs="$(compsize_disk_ref "${clone_cache}")"
  if [[ -n "${clone_cs}" && -n "${clone_cache_cs}" ]]; then
    clone_disk="${clone_cs%% *}"
    clone_ref="${clone_cs##* }"
    clone_cache_disk="${clone_cache_cs%% *}"
    restore_marginal=$(( clone_disk - clone_cache_disk ))
  fi

  # The warm run must be a genuine cache hit.
  if [ "$(cache_hits_total "${clone_cache}")" -eq 0 ]; then
    echo "ERROR: file_clone warm build for ${name} produced no cache hits" >&2
    stop_server
    exit 1
  fi
  # Reflink vs copy counters from the file_clone server.
  local reflinked copied
  reflinked="$(stat_json "${clone_cache}" objects_reflinked)"
  copied="$(stat_json "${clone_cache}" objects_copied_fallback)"
  stop_server

  echo " warm (file_clone): ${clone}s cache=${clone_size} KiB df-delta=${clone_restore_delta} KiB"
  echo " restored artifacts: logical=${restored_apparent} KiB"
  echo " compsize (cache+restore): on-disk=${clone_disk} KiB referenced=${clone_ref} KiB (cache-only on-disk=${clone_cache_disk} KiB)"
  echo " restore marginal disk: ${restore_marginal} KiB (~0 => restore reflinks the cache; compressed was ${comp_marginal} KiB)"
  echo " reflinked objects: ${reflinked} copied (fallback): ${copied}"

  RESULTS+=("| ${name} | ${cold} | ${warm} | ${clone} | ${comp_size} | ${clone_size} | ${restored_apparent} | ${clone_disk} | ${restore_marginal} | ${reflinked}/${copied} |")
}

# ----- offline C target (always) ----------------------------------------------
C_PROJ="${WORKDIR}/local-c"
gen_c_project "${C_PROJ}"
bench_target "local-c" "${C_PROJ}" \
  'make -j"$(nproc)" SCCACHE=' \
  'make -j"$(nproc)" SCCACHE="$SCCACHE"' \
  'make clean' \
  "build"

# ----- optional cargo repos
(network) ----------------------------------------- +REPO_SPECS=("$@") +if [[ -n "${BENCH_REPOS:-}" ]]; then + # shellcheck disable=SC2206 + REPO_SPECS+=(${BENCH_REPOS}) +fi +for spec in "${REPO_SPECS[@]:-}"; do + [[ -z "${spec}" ]] && continue + name="${spec%%=*}" + url="${spec#*=}" + echo + echo ">> Cloning ${name} from ${url} ..." + if ! git clone --depth 1 "${url}" "${WORKDIR}/${name}" >/dev/null 2>&1; then + echo " !! clone failed (offline?), skipping ${name}" + continue + fi + # Use RUSTC_WRAPPER so cargo routes rustc through sccache. + bench_target "${name}" "${WORKDIR}/${name}" \ + 'cargo build' \ + 'RUSTC_WRAPPER="$SCCACHE" cargo build' \ + 'cargo clean' \ + "target/debug" +done + +# ----- markdown summary ------------------------------------------------------- +echo +echo "## file_clone benchmark results" +echo +echo "Filesystem of work dir: $(stat -f -c '%T' "${WORKDIR}" 2>/dev/null || echo unknown)" +echo +echo "Times in seconds; sizes in KiB. 'cache+restore on disk' is the compsize disk usage of" +echo "the file_clone cache and the restored artifacts together (shared/reflinked extents counted" +echo "once). 'restore marginal disk' = that minus the compsize disk of the cache alone, i.e. the" +echo "NEW disk the restore consumes; ~0 means the restored artifacts reflink the cache (this" +echo "isolates block sharing from any btrfs transparent compression). 'reflink/copy' =" +echo "objects_reflinked/objects_copied_fallback from --show-stats (the proof CoW engaged)." 
+echo +echo "| target | cold | warm (compressed) | warm (file_clone) | compressed cache | file_clone cache | restored (logical) | cache+restore on disk | restore marginal disk | reflink/copy |" +echo "|--------|-----:|------------------:|------------------:|-----------------:|-----------------:|-------------------:|----------------------:|----------------------:|:------------:|" +for row in "${RESULTS[@]}"; do + echo "${row}" +done diff --git a/src/cache/cache.rs b/src/cache/cache.rs index f0b79eca63..67551363a7 100644 --- a/src/cache/cache.rs +++ b/src/cache/cache.rs @@ -75,6 +75,22 @@ pub trait Storage: Send + Sync { /// finished. async fn put(&self, key: &str, entry: CacheWrite) -> Result; + /// Store freshly produced compiler `objects` (plus `stdout`/`stderr`) under `key`. The default + /// zips+zstds them via `put`; the disk cache overrides it to reflink originals for `file_clone`. + async fn put_objects( + &self, + key: &str, + objects: Vec, + stdout: Vec, + stderr: Vec, + pool: &tokio::runtime::Handle, + ) -> Result { + let mut entry = CacheWrite::from_objects(objects, pool).await?; + entry.put_stdout(&stdout)?; + entry.put_stderr(&stderr)?; + self.put(key, entry).await + } + /// Get raw serialized cache entry bytes by `key` (for multi-level backfill). /// Returns `None` if the entry is not found, or if the implementation doesn't support raw access. /// This is used by multi-level caches to backfill faster levels. 
@@ -589,6 +605,7 @@ pub fn storage_from_config( preprocessor_cache_mode_config, rw_mode, config.basedirs.clone(), + config.fallback_cache.file_clone, ))) } diff --git a/src/cache/cache_io.rs b/src/cache/cache_io.rs index 9e29743e90..d4eddf3576 100644 --- a/src/cache/cache_io.rs +++ b/src/cache/cache_io.rs @@ -12,14 +12,78 @@ use super::utils::{get_file_mode, set_file_mode}; use crate::errors::*; +use crate::lru_disk_cache::DIR_ENTRY_MARKER; use fs_err as fs; +use std::collections::HashMap; +use std::ffi::OsStr; use std::fmt; use std::io::{Cursor, Read, Seek, Write}; -use std::path::{Path, PathBuf}; +use std::path::{Component, Path, PathBuf}; use tempfile::NamedTempFile; use zip::write::FileOptions; use zip::{CompressionMethod, ZipArchive, ZipWriter}; +/// Subdirectory inside a `file_clone` entry holding per-object files, so object keys can't collide +/// with the reserved `stdout`/`stderr`/marker names. +pub(crate) const OBJECTS_SUBDIR: &str = "objects"; + +/// Validate that an object key is a single normal path component (no separators/`..`/NUL), so it +/// can't escape the `objects/` directory via traversal. +pub(crate) fn validate_object_key(key: &str) -> Result<()> { + if key.is_empty() || key.contains('\0') { + bail!("invalid cache object key {:?}", key); + } + let mut components = Path::new(key).components(); + match (components.next(), components.next()) { + (Some(Component::Normal(c)), None) if c == OsStr::new(key) => Ok(()), + _ => bail!("cache object key {:?} is not a single path component", key), + } +} + +/// Serialize the object-key → unix-mode map into the marker file as NUL-separated `\0\0` +/// records (kept out-of-band so cache objects can stay `0600`). 
+pub(crate) fn serialize_mode_manifest(modes: &[(String, u32)]) -> Vec { + let mut out = Vec::new(); + for (key, mode) in modes { + out.extend_from_slice(mode.to_string().as_bytes()); + out.push(0); + out.extend_from_slice(key.as_bytes()); + out.push(0); + } + out +} + +/// Parse a marker file produced by [`serialize_mode_manifest`], tolerating empty/corrupt records. +pub(crate) fn parse_mode_manifest(bytes: &[u8]) -> HashMap { + let mut map = HashMap::new(); + let mut fields = bytes.split(|&b| b == 0); + while let (Some(mode_b), Some(key_b)) = (fields.next(), fields.next()) { + if let (Ok(mode_s), Ok(key_s)) = (std::str::from_utf8(mode_b), std::str::from_utf8(key_b)) { + if let Ok(mode) = mode_s.parse::() { + map.insert(key_s.to_owned(), mode); + } + } + } + map +} + +/// Counts of how cache objects were restored (reflinked vs copied), for `--show-stats`. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub struct ExtractionStats { + pub objects_reflinked: u64, + pub objects_copied: u64, +} + +impl ExtractionStats { + fn record(&mut self, outcome: crate::reflink::ReflinkOutcome) { + if outcome.reflinked() { + self.objects_reflinked += 1; + } else { + self.objects_copied += 1; + } + } +} + /// Cache object sourced by a file. #[derive(Clone)] pub struct FileObjectSource { @@ -34,8 +98,10 @@ pub struct FileObjectSource { /// Result of a cache lookup. pub enum Cache { - /// Result was found in cache. + /// Result was found in cache (compressed ZIP-of-zstd format). Hit(CacheRead), + /// Result was found in cache (uncompressed directory format, `file_clone` mode). + UncompressedHit(UncompressedCacheEntry), /// Result was not found in cache. Miss, /// Do not cache the results of the compilation. 
@@ -48,6 +114,7 @@ impl fmt::Debug for Cache { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Cache::Hit(_) => write!(f, "Cache::Hit(...)"), + Cache::UncompressedHit(_) => write!(f, "Cache::UncompressedHit(...)"), Cache::Miss => write!(f, "Cache::Miss"), Cache::None => write!(f, "Cache::None"), Cache::Recache => write!(f, "Cache::Recache"), @@ -132,7 +199,7 @@ impl CacheRead { mut self, objects: T, pool: &tokio::runtime::Handle, - ) -> Result<()> + ) -> Result where T: IntoIterator + Send + Sync + 'static, { @@ -195,19 +262,19 @@ impl CacheRead { (Err(_), true) => continue, } } - Ok(()) + Ok(ExtractionStats::default()) }) .await? } } #[cfg(unix)] -fn is_path_null(path: &Path) -> bool { +pub(crate) fn is_path_null(path: &Path) -> bool { path == Path::new("/dev/null") } #[cfg(windows)] -fn is_path_null(path: &Path) -> bool { +pub(crate) fn is_path_null(path: &Path) -> bool { // For Windows, it appears that `NUL` with whatever extension is also a blackhole // (at least for `CreateFileX`), so it does not suffice to check for an exact match // Also note that gcc, cl.exe, et al. append a correct extension automatically even @@ -318,6 +385,118 @@ impl Default for CacheWrite { } } +/// An uncompressed (`file_clone`) cache entry: a directory with an `objects/` subdir, optional +/// `stdout`/`stderr` files, and a marker file carrying the per-object mode manifest. +#[derive(Debug)] +pub struct UncompressedCacheEntry { + dir: PathBuf, +} + +impl UncompressedCacheEntry { + /// Create a handle to the entry stored at `dir`. + pub fn new(dir: PathBuf) -> Self { + Self { dir } + } + + /// Read the stored stdout, or an empty vector if there is none. + pub fn get_stdout(&self) -> Vec { + self.get_bytes_file("stdout") + } + + /// Read the stored stderr, or an empty vector if there is none. 
+ pub fn get_stderr(&self) -> Vec { + self.get_bytes_file("stderr") + } + + fn get_bytes_file(&self, name: &str) -> Vec { + match fs::read(self.dir.join(name)) { + Ok(bytes) => bytes, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Vec::new(), + Err(e) => { + debug!( + "Failed to read {} from uncompressed cache entry: {}", + name, e + ); + Vec::new() + } + } + } + + /// Restore the requested `objects` into their destinations (reflinking or copying), applying the + /// original mode from the manifest. Returns reflinked-vs-copied counts. + pub async fn extract_objects( + self, + objects: T, + pool: &tokio::runtime::Handle, + ) -> Result + where + T: IntoIterator + Send + Sync + 'static, + { + pool.spawn_blocking(move || { + let modes = parse_mode_manifest(&fs::read(self.dir.join(DIR_ENTRY_MARKER))?); + let objects_dir = self.dir.join(OBJECTS_SUBDIR); + let mut stats = ExtractionStats::default(); + for FileObjectSource { + key, + path, + optional, + } in objects + { + if is_path_null(&path) { + debug!("Skipping output to {}", path.display()); + continue; + } + if let Err(e) = validate_object_key(&key) { + if optional { + continue; + } + return Err(e); + } + let src = objects_dir.join(&key); + if !src.exists() { + if optional { + continue; + } + bail!( + "Required object `{}` not found in uncompressed cache entry", + key + ); + } + let dir = match path.parent() { + Some(d) => d, + None => bail!("Output file without a parent directory!"), + }; + if let Err(e) = fs::create_dir_all(dir) { + if optional { + continue; + } + return Err(e).with_context(|| { + format!("failed to create output directory {}", dir.display()) + }); + } + let mode = modes.get(&key).copied(); + let outcome = match crate::reflink::reflink_or_copy_atomic(&src, &path, mode) { + Ok(outcome) => outcome, + Err(_) => match crate::reflink::reflink_or_copy_direct(&src, &path, mode) { + Ok(outcome) => outcome, + Err(e) => { + if optional { + continue; + } + return 
Err(anyhow::Error::from(e)).with_context(|| { + format!("failed to restore object `{}` to {}", key, path.display()) + }); + } + }, + }; + stats.record(outcome); + } + Ok(stats) + }) + .await? + } +} + #[cfg(test)] mod tests { use super::*; @@ -472,4 +651,332 @@ mod tests { let result = runtime.block_on(cache_read.extract_objects(objects, pool)); assert!(result.is_ok(), "Extracting to NUL should succeed"); } + + fn current_thread_runtime() -> tokio::runtime::Runtime { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .worker_threads(1) + .build() + .unwrap() + } + + fn make_uncompressed_entry(objects: &[(&str, &[u8], u32)]) -> tempfile::TempDir { + let dir = tempfile::tempdir().unwrap(); + let objects_dir = dir.path().join(OBJECTS_SUBDIR); + std::fs::create_dir(&objects_dir).unwrap(); + let mut manifest = Vec::new(); + for (name, contents, mode) in objects { + std::fs::write(objects_dir.join(name), contents).unwrap(); + manifest.push(((*name).to_string(), *mode)); + } + std::fs::write( + dir.path().join(DIR_ENTRY_MARKER), + serialize_mode_manifest(&manifest), + ) + .unwrap(); + dir + } + + #[test] + fn test_validate_object_key() { + for ok in ["obj", "d", "stdout", "stderr", "output.rlib", "a.b.c"] { + assert!(validate_object_key(ok).is_ok(), "{ok:?} should be valid"); + } + for bad in ["", ".", "..", "a/b", "/a", "a/", "a\0b"] { + assert!( + validate_object_key(bad).is_err(), + "{bad:?} should be invalid" + ); + } + #[cfg(windows)] + assert!(validate_object_key("a\\b").is_err()); + } + + #[test] + fn test_mode_manifest_roundtrip() { + let modes = vec![ + ("obj".to_string(), 0o100644u32), + ("weird key with spaces".to_string(), 0o100755u32), + ]; + let parsed = parse_mode_manifest(&serialize_mode_manifest(&modes)); + assert_eq!(parsed.get("obj"), Some(&0o100644)); + assert_eq!(parsed.get("weird key with spaces"), Some(&0o100755)); + assert!(parse_mode_manifest(b"").is_empty()); + assert!(parse_mode_manifest(b"garbage").is_empty()); + 
assert!(parse_mode_manifest(b"notanumber\0key\0").is_empty()); + } + + #[test] + fn test_uncompressed_extract_required_create_dir_fails() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = tempfile::tempdir().unwrap(); + // `blocker` is a regular file, so `blocker/sub` can't be created as a directory. + let blocker = out_dir.path().join("blocker"); + std::fs::write(&blocker, b"x").unwrap(); + let dest = blocker.join("sub").join("out.o"); + + let result = runtime.block_on(entry.extract_objects( + vec![FileObjectSource { + key: "obj".to_string(), + path: dest, + optional: false, + }], + runtime.handle(), + )); + assert!( + result.is_err(), + "required object with un-creatable dest dir must error" + ); + } + + #[test] + fn test_uncompressed_extract_double_failure_required_errors_optional_skips() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + + let out_dir = tempfile::tempdir().unwrap(); + // Destination path is an existing directory: both restore paths fail to write to it. 
+ let dest = out_dir.path().join("dest_is_a_dir"); + std::fs::create_dir(&dest).unwrap(); + + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + let required = vec![FileObjectSource { + key: "obj".to_string(), + path: dest.clone(), + optional: false, + }]; + assert!( + runtime + .block_on(entry.extract_objects(required, runtime.handle())) + .is_err(), + "required object that can't be restored (both attempts fail) must error" + ); + + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + let optional = vec![FileObjectSource { + key: "obj".to_string(), + path: dest.clone(), + optional: true, + }]; + let stats = runtime + .block_on(entry.extract_objects(optional, runtime.handle())) + .unwrap(); + assert_eq!(stats, ExtractionStats::default()); + assert!( + dest.is_dir(), + "optional double-failure leaves dest untouched" + ); + } + + #[test] + fn test_uncompressed_extract_roundtrip() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"object-bytes", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = tempfile::tempdir().unwrap(); + let out_path = out_dir.path().join("restored.o"); + let objects = vec![FileObjectSource { + key: "obj".to_string(), + path: out_path.clone(), + optional: false, + }]; + + let stats = runtime + .block_on(entry.extract_objects(objects, runtime.handle())) + .unwrap(); + assert_eq!(std::fs::read(&out_path).unwrap(), b"object-bytes"); + assert_eq!(stats.objects_reflinked + stats.objects_copied, 1); + } + + #[test] + fn test_uncompressed_extract_reflinks_on_cow() { + let out_dir = tempfile::tempdir().unwrap(); + if !crate::reflink::is_reflink_supported(out_dir.path()) { + return; // non-CoW filesystem: covered by the FS-agnostic test above. + } + // The cache entry must live on the same filesystem as the destination to reflink. 
+ let entry_dir = tempfile::tempdir_in(out_dir.path()).unwrap(); + let objects_dir = entry_dir.path().join(OBJECTS_SUBDIR); + std::fs::create_dir(&objects_dir).unwrap(); + let data = vec![7u8; 256 * 1024]; + std::fs::write(objects_dir.join("obj"), &data).unwrap(); + std::fs::write( + entry_dir.path().join(DIR_ENTRY_MARKER), + serialize_mode_manifest(&[("obj".to_string(), 0o100644)]), + ) + .unwrap(); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let runtime = current_thread_runtime(); + let out_path = out_dir.path().join("restored.o"); + let stats = runtime + .block_on(entry.extract_objects( + vec![FileObjectSource { + key: "obj".to_string(), + path: out_path.clone(), + optional: false, + }], + runtime.handle(), + )) + .unwrap(); + assert_eq!(std::fs::read(&out_path).unwrap(), data); + assert_eq!( + stats.objects_reflinked, 1, + "should reflink on a CoW filesystem" + ); + assert_eq!(stats.objects_copied, 0); + } + + #[cfg(unix)] + #[test] + fn test_uncompressed_extract_to_devnull() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let objects = vec![FileObjectSource { + key: "obj".to_string(), + path: PathBuf::from("/dev/null"), + optional: false, + }]; + let stats = runtime + .block_on(entry.extract_objects(objects, runtime.handle())) + .unwrap(); + assert_eq!(stats, ExtractionStats::default()); + } + + #[test] + fn test_uncompressed_extract_missing_optional_is_skipped() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = tempfile::tempdir().unwrap(); + let objects = vec![FileObjectSource { + key: "missing".to_string(), + path: out_dir.path().join("missing.d"), + optional: true, + }]; + let stats = runtime + 
.block_on(entry.extract_objects(objects, runtime.handle())) + .unwrap(); + assert_eq!(stats, ExtractionStats::default()); + assert!(!out_dir.path().join("missing.d").exists()); + } + + #[test] + fn test_uncompressed_extract_missing_required_errors() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = tempfile::tempdir().unwrap(); + let objects = vec![FileObjectSource { + key: "missing".to_string(), + path: out_dir.path().join("missing.o"), + optional: false, + }]; + let result = runtime.block_on(entry.extract_objects(objects, runtime.handle())); + assert!(result.is_err(), "missing required object should error"); + } + + #[test] + fn test_uncompressed_extract_rejects_bad_key() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"data", 0o100644)]); + let out_dir = tempfile::tempdir().unwrap(); + + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + let required = vec![FileObjectSource { + key: "../escape".to_string(), + path: out_dir.path().join("escape"), + optional: false, + }]; + assert!( + runtime + .block_on(entry.extract_objects(required, runtime.handle())) + .is_err() + ); + + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + let optional = vec![FileObjectSource { + key: "../escape".to_string(), + path: out_dir.path().join("escape"), + optional: true, + }]; + let stats = runtime + .block_on(entry.extract_objects(optional, runtime.handle())) + .unwrap(); + assert_eq!(stats, ExtractionStats::default()); + } + + #[test] + fn test_uncompressed_extract_to_non_writable_dir() { + let runtime = current_thread_runtime(); + let entry_dir = make_uncompressed_entry(&[("obj", b"restored-content", 0o100644)]); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = 
tempfile::tempdir().unwrap(); + let target = out_dir.path().join("out.o"); + std::fs::write(&target, b"stale").unwrap(); + let mut perm = out_dir.path().metadata().unwrap().permissions(); + perm.set_readonly(true); + std::fs::set_permissions(out_dir.path(), perm.clone()).unwrap(); + + let result = runtime.block_on(entry.extract_objects( + vec![FileObjectSource { + key: "obj".to_string(), + path: target.clone(), + optional: false, + }], + runtime.handle(), + )); + + // Reset permissions so the tempdir can be cleaned up regardless of outcome. + #[allow( + clippy::permissions_set_readonly_false, + reason = "directory is deleted immediately; no security implication" + )] + perm.set_readonly(false); + std::fs::set_permissions(out_dir.path(), perm).unwrap(); + + assert!( + result.is_ok(), + "extract to a non-writable dir should succeed" + ); + assert_eq!(std::fs::read(&target).unwrap(), b"restored-content"); + } + + #[cfg(unix)] + #[test] + fn test_uncompressed_extract_restores_output_mode_from_manifest() { + use std::os::unix::fs::PermissionsExt; + let runtime = current_thread_runtime(); + // Manifest records the original 0755 output mode even though the cache object on disk is + // written 0600 by the real write path; restore must reproduce 0755. 
+ let entry_dir = make_uncompressed_entry(&[("bin", b"#!/bin/sh\n", 0o100755)]); + std::fs::set_permissions( + entry_dir.path().join(OBJECTS_SUBDIR).join("bin"), + std::fs::Permissions::from_mode(0o600), + ) + .unwrap(); + let entry = UncompressedCacheEntry::new(entry_dir.path().to_path_buf()); + + let out_dir = tempfile::tempdir().unwrap(); + let out_path = out_dir.path().join("restored.sh"); + let objects = vec![FileObjectSource { + key: "bin".to_string(), + path: out_path.clone(), + optional: false, + }]; + runtime + .block_on(entry.extract_objects(objects, runtime.handle())) + .unwrap(); + let mode = std::fs::metadata(&out_path).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o755, "restored output reproduces the original mode"); + } } diff --git a/src/cache/disk.rs b/src/cache/disk.rs index ebd65d1a3f..27e4030ede 100644 --- a/src/cache/disk.rs +++ b/src/cache/disk.rs @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::cache::{Cache, CacheMode, CacheRead, CacheWrite, Storage}; +use crate::cache::cache_io::{ + FileObjectSource, OBJECTS_SUBDIR, is_path_null, serialize_mode_manifest, validate_object_key, +}; +use crate::cache::{Cache, CacheMode, CacheRead, CacheWrite, Storage, UncompressedCacheEntry}; use crate::compiler::PreprocessorCacheEntry; -use crate::lru_disk_cache::{Error as LruError, ReadSeek}; +use crate::lru_disk_cache::{DIR_ENTRY_MARKER, Error as LruError, ReadSeek, TEMPFILE_PREFIX}; use async_trait::async_trait; use bytes::Bytes; +use fs_err as fs; use std::ffi::OsStr; use std::io::{BufWriter, Read, Write}; use std::path::{Path, PathBuf}; @@ -26,7 +30,7 @@ use std::time::{Duration, Instant}; use crate::errors::*; use super::lazy_disk_cache::LazyDiskCache; -use super::utils::normalize_key; +use super::utils::{file_mode_of, normalize_key, set_file_mode}; use crate::config::PreprocessorCacheModeConfig; /// A cache that stores entries at local disk paths. 
@@ -39,6 +43,8 @@ pub struct DiskCache { preprocessor_cache: Arc>, rw_mode: CacheMode, basedirs: Vec>, + /// `file_clone`: store entries uncompressed and restore them via reflinks. + use_uncompressed: bool, } impl DiskCache { @@ -50,11 +56,59 @@ impl DiskCache { preprocessor_cache_mode_config: PreprocessorCacheModeConfig, rw_mode: CacheMode, basedirs: Vec>, + file_clone: bool, ) -> DiskCache { + if file_clone { + let root_path = Path::new(root.as_ref()); + #[cfg(unix)] + let preexisting_mode = { + use std::os::unix::fs::PermissionsExt; + std::fs::metadata(root_path) + .ok() + .map(|m| m.permissions().mode() & 0o7777) + }; + let _ = std::fs::create_dir_all(root_path); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + if let Some(mode) = preexisting_mode { + if mode & 0o077 != 0 { + warn!( + "file_clone: tightening pre-existing cache directory {root_path:?} \ + permissions from {mode:#o} to 0700 (an uncompressed cache must be \ + user-private). Do not point SCCACHE_DIR at a shared/group cache \ + directory when using file_clone." + ); + } + } + match std::fs::set_permissions(root_path, std::fs::Permissions::from_mode(0o700)) { + Ok(()) => { + debug!("file_clone: cache directory {root_path:?} is private (0700)"); + } + Err(e) => warn!( + "file_clone: could not make cache directory {root_path:?} private (chmod \ + 0700 failed: {e}). An uncompressed cache readable/writable by other users \ + can be poisoned; point SCCACHE_DIR at a directory you own." + ), + } + } + if crate::reflink::is_reflink_supported(root_path) { + debug!("file_clone enabled: uncompressed storage with reflink (copy-on-write)"); + } else { + warn!( + "file_clone enabled but the cache directory's filesystem does not support \ + reflinks: entries are stored uncompressed and restored via copies, so they \ + will not share disk blocks with the cache. Put the cache directory and the \ + build directory on the same copy-on-write filesystem (Btrfs/XFS/APFS/ReFS) \ + for the full benefit." 
+ ); + } + } DiskCache { lru: Arc::new(Mutex::new(LazyDiskCache::Uninit { root: root.as_ref().to_os_string(), max_size, + support_dir_entries: file_clone, })), pool: pool.clone(), preprocessor_cache_mode_config, @@ -63,9 +117,11 @@ impl DiskCache { .join("preprocessor") .into_os_string(), max_size, + support_dir_entries: false, })), rw_mode, basedirs, + use_uncompressed: file_clone, } } } @@ -75,6 +131,17 @@ fn make_key_path(key: &str) -> PathBuf { Path::new(&key[0..1]).join(&key[1..2]).join(key) } +fn is_uncompressed_entry(cache_root: &Path, key_path: &Path) -> bool { + let path = cache_root.join(key_path); + path.is_dir() && path.join(DIR_ENTRY_MARKER).exists() +} + +fn write_private(path: &Path, bytes: &[u8]) -> Result<()> { + fs::write(path, bytes)?; + set_file_mode(path, 0o600)?; + Ok(()) +} + #[async_trait] impl Storage for DiskCache { async fn get(&self, key: &str) -> Result { @@ -82,10 +149,25 @@ impl Storage for DiskCache { let path = make_key_path(key); let lru = self.lru.clone(); let key = key.to_owned(); + let use_uncompressed = self.use_uncompressed; self.pool .spawn_blocking(move || { - let io = match lru.lock().unwrap().get_or_init()?.get(&path) { + let mut binding = lru.lock().unwrap(); + let cache = binding.get_or_init()?; + + if use_uncompressed { + let cache_root = cache.path().to_path_buf(); + if is_uncompressed_entry(&cache_root, &path) { + let _ = cache.touch(&path); + trace!("DiskCache::get({}): UncompressedHit", key); + return Ok(Cache::UncompressedHit(UncompressedCacheEntry::new( + cache_root.join(&path), + ))); + } + } + + let io = match cache.get(&path) { Ok(f) => f, Err(LruError::FileNotInCache) => { trace!("DiskCache::get({}): FileNotInCache", key); @@ -108,10 +190,20 @@ impl Storage for DiskCache { let path = make_key_path(key); let lru = self.lru.clone(); let key = key.to_owned(); + let use_uncompressed = self.use_uncompressed; self.pool - .spawn_blocking( - move || match lru.lock().unwrap().get_or_init()?.get(&path) { + 
.spawn_blocking(move || { + let mut binding = lru.lock().unwrap(); + let cache = binding.get_or_init()?; + if use_uncompressed && is_uncompressed_entry(cache.path(), &path) { + trace!( + "DiskCache::get_raw({}): uncompressed entry, returning None", + key + ); + return Ok(None); + } + match cache.get(&path) { Ok(mut io) => { let mut data = Vec::new(); io.read_to_end(&mut data)?; @@ -127,8 +219,8 @@ impl Storage for DiskCache { Err(e.into()) } Err(_) => unreachable!(), - }, - ) + } + }) .await? } @@ -164,6 +256,129 @@ impl Storage for DiskCache { .await? } + async fn put_objects( + &self, + key: &str, + objects: Vec, + stdout: Vec, + stderr: Vec, + pool: &tokio::runtime::Handle, + ) -> Result { + if self.rw_mode == CacheMode::ReadOnly { + return Err(anyhow!("Cannot write to a read-only cache")); + } + + if !self.use_uncompressed { + let mut entry = CacheWrite::from_objects(objects, pool).await?; + entry.put_stdout(&stdout)?; + entry.put_stderr(&stderr)?; + return self.put(key, entry).await; + } + + let lru = self.lru.clone(); + let key_path = make_key_path(key); + + pool.spawn_blocking(move || { + let start = Instant::now(); + let cache_root = { + let mut binding = lru.lock().unwrap(); + binding.get_or_init()?.path().to_path_buf() + }; + + let mut builder = tempfile::Builder::new(); + builder.prefix(TEMPFILE_PREFIX); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + builder.permissions(std::fs::Permissions::from_mode(0o700)); + } + let staging = builder.tempdir_in(&cache_root)?; + let objects_dir = staging.path().join(OBJECTS_SUBDIR); + #[cfg(unix)] + { + use std::os::unix::fs::DirBuilderExt; + std::fs::DirBuilder::new() + .mode(0o700) + .create(&objects_dir)?; + } + #[cfg(not(unix))] + fs::create_dir(&objects_dir)?; + + let mut mode_manifest: Vec<(String, u32)> = Vec::new(); + let mut stored_count = 0usize; + + for FileObjectSource { + key: obj_key, + path, + optional, + } in objects + { + if let Err(e) = validate_object_key(&obj_key) { + if optional { + 
continue; + } + return Err(e); + } + let dest = objects_dir.join(&obj_key); + if is_path_null(&path) { + write_private(&dest, b"")?; + stored_count += 1; + continue; + } + let mode = match fs::metadata(&path) { + Ok(meta) => file_mode_of(&meta), + Err(e) => { + if optional { + continue; + } + return Err(e).with_context(|| { + format!("failed to read compiler output `{}`", path.display()) + }); + } + }; + if let Err(e) = crate::reflink::reflink_or_copy_new(&path, &dest, Some(0o600)) { + let _ = fs::remove_file(&dest); + if optional { + continue; + } + return Err(anyhow::Error::from(e)).with_context(|| { + format!("failed to store compiler output `{}`", path.display()) + }); + } + if let Some(mode) = mode { + mode_manifest.push((obj_key, mode)); + } + stored_count += 1; + } + + if stored_count == 0 { + return Ok(start.elapsed()); + } + + if !stdout.is_empty() { + write_private(&staging.path().join("stdout"), &stdout)?; + } + if !stderr.is_empty() { + write_private(&staging.path().join("stderr"), &stderr)?; + } + write_private( + &staging.path().join(DIR_ENTRY_MARKER), + &serialize_mode_manifest(&mode_manifest), + )?; + + let mut binding = lru.lock().unwrap(); + let cache = binding.get_or_init()?; + #[allow(deprecated)] // `into_path` is not deprecated in the locked tempfile 3.10.1 + let staging_path = staging.into_path(); + if let Err(e) = cache.insert_dir(&key_path, &staging_path) { + let _ = fs::remove_dir_all(&staging_path); + return Err(e.into()); + } + Ok(start.elapsed()) + }) + .await? 
+ } + async fn check(&self) -> Result { Ok(self.rw_mode) } @@ -229,6 +444,40 @@ impl Storage for DiskCache { mod tests { use super::*; + fn new_disk_cache( + root: &Path, + runtime: &tokio::runtime::Runtime, + file_clone: bool, + ) -> DiskCache { + DiskCache::new( + root, + 100 * 1024 * 1024, + runtime.handle(), + PreprocessorCacheModeConfig::default(), + CacheMode::ReadWrite, + vec![], + file_clone, + ) + } + + fn fobj(key: &str, path: &Path) -> FileObjectSource { + FileObjectSource { + key: key.to_string(), + path: path.to_path_buf(), + optional: false, + } + } + + const TEST_KEY: &str = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + + fn mt_runtime() -> tokio::runtime::Runtime { + tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap() + } + #[test] fn test_disk_cache_type_name() { let tempdir = tempfile::tempdir().unwrap(); @@ -236,15 +485,562 @@ mod tests { .build() .unwrap(); - let disk = DiskCache::new( - tempdir.path(), - 1024 * 1024, + let disk = new_disk_cache(tempdir.path(), &runtime, false); + + assert_eq!(disk.cache_type_name(), "disk"); + } + + #[test] + fn test_uncompressed_put_objects_get_roundtrip() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let build = tempdir.path().join("build"); + std::fs::create_dir_all(&build).unwrap(); + let obj_path = build.join("output.o"); + std::fs::write(&obj_path, b"raw object bytes").unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&obj_path, std::fs::Permissions::from_mode(0o644)).unwrap(); + } + + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + b"the stdout".to_vec(), + b"the stderr".to_vec(), + runtime.handle(), + )) + .unwrap(); + + let entry_dir = cache_dir.join(make_key_path(TEST_KEY)); + assert!(entry_dir.is_dir(), 
"entry should be a directory"); + assert!(entry_dir.join(DIR_ENTRY_MARKER).exists(), "marker present"); + let obj_file = entry_dir.join(OBJECTS_SUBDIR).join("obj"); + assert_eq!( + std::fs::read(&obj_file).unwrap(), + b"raw object bytes", + "stored object is uncompressed and byte-identical" + ); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + assert_eq!( + std::fs::metadata(&obj_file).unwrap().permissions().mode() & 0o777, + 0o600, + "cache object must be stored 0600" + ); + } + + match runtime.block_on(disk.get(TEST_KEY)).unwrap() { + Cache::UncompressedHit(entry) => { + assert_eq!(entry.get_stdout(), b"the stdout"); + assert_eq!(entry.get_stderr(), b"the stderr"); + } + other => panic!("expected UncompressedHit, got {other:?}"), + } + } + + #[cfg(unix)] + #[test] + fn test_uncompressed_put_objects_dirs_are_private() { + use std::os::unix::fs::PermissionsExt; + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"data").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + + let entry_dir = cache_dir.join(make_key_path(TEST_KEY)); + let mode_of = |p: &Path| std::fs::metadata(p).unwrap().permissions().mode() & 0o777; + assert_eq!( + mode_of(&entry_dir), + 0o700, + "entry directory must be user-private (0700)" + ); + assert_eq!( + mode_of(&entry_dir.join(OBJECTS_SUBDIR)), + 0o700, + "objects/ subdir must be user-private (0700)" + ); + } + + #[cfg(unix)] + #[test] + fn test_uncompressed_empty_stdio_and_mode_restore() { + use std::os::unix::fs::PermissionsExt; + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let bin = 
tempdir.path().join("a.out"); + std::fs::write(&bin, b"#!/bin/sh\n").unwrap(); + std::fs::set_permissions(&bin, std::fs::Permissions::from_mode(0o755)).unwrap(); + + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &bin)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + + let entry_dir = cache_dir.join(make_key_path(TEST_KEY)); + assert!( + !entry_dir.join("stdout").exists(), + "empty stdout not stored" + ); + assert!( + !entry_dir.join("stderr").exists(), + "empty stderr not stored" + ); + + let out = tempdir.path().join("restored.out"); + let Cache::UncompressedHit(entry) = runtime.block_on(disk.get(TEST_KEY)).unwrap() else { + panic!("expected UncompressedHit"); + }; + runtime + .block_on(entry.extract_objects(vec![fobj("obj", &out)], runtime.handle())) + .unwrap(); + assert_eq!( + std::fs::metadata(&out).unwrap().permissions().mode() & 0o777, + 0o755 + ); + } + + #[test] + fn test_object_key_named_stdout_does_not_collide() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj = tempdir.path().join("obj_named_stdout"); + std::fs::write(&obj, b"object-content").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("stdout", &obj)], + b"PROCESS STDOUT".to_vec(), + vec![], + runtime.handle(), + )) + .unwrap(); + + let out = tempdir.path().join("restored_stdout_obj"); + let Cache::UncompressedHit(entry) = runtime.block_on(disk.get(TEST_KEY)).unwrap() else { + panic!("expected UncompressedHit"); + }; + assert_eq!(entry.get_stdout(), b"PROCESS STDOUT"); + runtime + .block_on(entry.extract_objects(vec![fobj("stdout", &out)], runtime.handle())) + .unwrap(); + assert_eq!(std::fs::read(&out).unwrap(), b"object-content"); + } + + #[test] + fn test_uncompressed_extract_roundtrip() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = 
tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"hello reflink world").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + + let restore_path = tempdir.path().join("restored").join("out.o"); + let cache = runtime.block_on(disk.get(TEST_KEY)).unwrap(); + let Cache::UncompressedHit(entry) = cache else { + panic!("expected UncompressedHit"); + }; + let stats = runtime + .block_on(entry.extract_objects(vec![fobj("obj", &restore_path)], runtime.handle())) + .unwrap(); + assert_eq!( + std::fs::read(&restore_path).unwrap(), + b"hello reflink world" + ); + assert_eq!(stats.objects_reflinked + stats.objects_copied, 1); + } + + #[test] + fn test_mode_switch_compressed_then_uncompressed() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let cache_dir = tempdir.path().join("cache"); + + { + let disk = new_disk_cache(&cache_dir, &runtime, false); + let mut entry = CacheWrite::new(); + entry + .put_object("obj", &mut std::io::Cursor::new(b"compressed"), Some(0o644)) + .unwrap(); + runtime.block_on(disk.put(TEST_KEY, entry)).unwrap(); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::Hit(_) + )); + } + { + let disk = new_disk_cache(&cache_dir, &runtime, true); + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"uncompressed").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::UncompressedHit(_) 
+ )); + let entry_path = cache_dir.join(make_key_path(TEST_KEY)); + assert!(entry_path.is_dir(), "key should now be a directory"); + } + } + + #[test] + fn test_mode_switch_uncompressed_then_compressed() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"uncompressed").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + assert!(cache_dir.join(make_key_path(TEST_KEY)).is_dir()); + + let mut entry = CacheWrite::new(); + entry + .put_object("obj", &mut std::io::Cursor::new(b"compressed"), Some(0o644)) + .unwrap(); + runtime.block_on(disk.put(TEST_KEY, entry)).unwrap(); + assert!(cache_dir.join(make_key_path(TEST_KEY)).is_file()); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::Hit(_) + )); + } + + #[test] + fn test_default_cache_uses_compressed_entries() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, false); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"data").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + assert!(cache_dir.join(make_key_path(TEST_KEY)).is_file()); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::Hit(_) + )); + } + + #[test] + fn test_preprocessor_cache_untouched_with_file_clone() { + use crate::compiler::PreprocessorCacheEntry; + + let 
tempdir = tempfile::tempdir().unwrap(); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"obj").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + let object_size = runtime.block_on(disk.current_size()).unwrap(); + runtime + .block_on(disk.put_preprocessor_cache_entry(TEST_KEY, PreprocessorCacheEntry::new())) + .unwrap(); + + let preprocessor_root = cache_dir.join("preprocessor"); + assert!( + preprocessor_root.is_dir(), + "preprocessor subtree must exist" + ); + + // Re-open the object cache and force its file_clone init walk, which prunes the sibling + // preprocessor subtree from the object cache's own bookkeeping. + drop(disk); + let disk = new_disk_cache(&cache_dir, &runtime, true); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::UncompressedHit(_) + )); + + // That init must leave the preprocessor subtree and its entry untouched on disk. + assert!( + preprocessor_root.is_dir(), + "preprocessor subtree must survive object cache re-init" + ); + let preprocessor_entry = cache_dir + .join("preprocessor") + .join(&TEST_KEY[0..1]) + .join(&TEST_KEY[1..2]) + .join(&TEST_KEY[2..3]) + .join(TEST_KEY); + assert!( + preprocessor_entry.is_file(), + "preprocessor entry must not be pruned by the object cache" + ); + + // The object cache must not count the preprocessor files toward its size. 
+ assert_eq!( + runtime.block_on(disk.current_size()).unwrap(), + object_size, + "object cache must not count preprocessor files after re-init" + ); + } + + #[test] + fn test_get_raw_uncompressed_returns_none() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + + let obj_path = tempdir.path().join("orig.o"); + std::fs::write(&obj_path, b"data").unwrap(); + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![fobj("obj", &obj_path)], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + assert!( + runtime.block_on(disk.get_raw(TEST_KEY)).unwrap().is_none(), + "uncompressed entry must not expose raw bytes" + ); + + // A compressed entry on the same cache does expose raw bytes. + const KEY2: &str = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210"; + let mut entry = CacheWrite::new(); + entry + .put_object("obj", &mut std::io::Cursor::new(b"x"), Some(0o644)) + .unwrap(); + runtime.block_on(disk.put(KEY2, entry)).unwrap(); + assert!(runtime.block_on(disk.get_raw(KEY2)).unwrap().is_some()); + } + + #[test] + fn test_put_objects_branches() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + + let ro_dir = tempdir.path().join("ro"); + let ro = DiskCache::new( + &ro_dir, + 100 * 1024 * 1024, runtime.handle(), PreprocessorCacheModeConfig::default(), - CacheMode::ReadWrite, + CacheMode::ReadOnly, vec![], + true, + ); + let f = tempdir.path().join("f"); + std::fs::write(&f, b"x").unwrap(); + assert!( + runtime + .block_on(ro.put_objects( + TEST_KEY, + vec![fobj("obj", &f)], + vec![], + vec![], + runtime.handle() + )) + .is_err() ); - assert_eq!(disk.cache_type_name(), "disk"); + let cache_dir = tempdir.path().join("cache"); + let disk = new_disk_cache(&cache_dir, &runtime, true); + let missing = FileObjectSource { + key: "obj".to_string(), + path: tempdir.path().join("does-not-exist"), + 
optional: true, + }; + runtime + .block_on(disk.put_objects(TEST_KEY, vec![missing], vec![], vec![], runtime.handle())) + .unwrap(); + assert!( + !cache_dir.join(make_key_path(TEST_KEY)).exists(), + "no entry for empty object set" + ); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::Miss + )); + + let required_missing = FileObjectSource { + key: "obj".to_string(), + path: tempdir.path().join("also-missing"), + optional: false, + }; + assert!( + runtime + .block_on(disk.put_objects( + TEST_KEY, + vec![required_missing], + vec![], + vec![], + runtime.handle() + )) + .is_err() + ); + + #[cfg(unix)] + { + const KEY3: &str = "1111111111111111111111111111111111111111111111111111111111111111"; + runtime + .block_on(disk.put_objects( + KEY3, + vec![fobj("obj", Path::new("/dev/null"))], + vec![], + vec![], + runtime.handle(), + )) + .unwrap(); + let obj = cache_dir + .join(make_key_path(KEY3)) + .join(OBJECTS_SUBDIR) + .join("obj"); + assert!(obj.exists(), "null output stored as empty object"); + assert_eq!(std::fs::metadata(&obj).unwrap().len(), 0); + } + } + + #[test] + fn test_concurrent_same_key_put_objects() { + let tempdir = tempfile::tempdir().unwrap(); + let runtime = mt_runtime(); + let cache_dir = tempdir.path().join("cache"); + let disk = std::sync::Arc::new(new_disk_cache(&cache_dir, &runtime, true)); + + runtime.block_on(async { + let mut handles = Vec::new(); + for i in 0..8u8 { + let disk = disk.clone(); + let src = tempdir.path().join(format!("src{i}")); + std::fs::write(&src, vec![b'a' + i; 4096]).unwrap(); + let handle = disk.pool.clone(); + handles.push(tokio::spawn(async move { + disk.put_objects( + TEST_KEY, + vec![FileObjectSource { + key: "obj".to_string(), + path: src, + optional: false, + }], + vec![], + vec![], + &handle, + ) + .await + })); + } + for h in handles { + h.await.unwrap().unwrap(); + } + }); + + let entry_dir = cache_dir.join(make_key_path(TEST_KEY)); + assert!(entry_dir.join(DIR_ENTRY_MARKER).exists()); + 
let content = std::fs::read(entry_dir.join(OBJECTS_SUBDIR).join("obj")).unwrap(); + assert_eq!(content.len(), 4096); + assert!((b'a'..=b'h').contains(&content[0]) && content.iter().all(|&b| b == content[0])); + let leftovers: Vec<_> = std::fs::read_dir(&cache_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.file_name().to_string_lossy().starts_with(TEMPFILE_PREFIX)) + .collect(); + assert!( + leftovers.is_empty(), + "no leftover .sccachetmp* staging dirs" + ); + assert!(matches!( + runtime.block_on(disk.get(TEST_KEY)).unwrap(), + Cache::UncompressedHit(_) + )); } } diff --git a/src/cache/lazy_disk_cache.rs b/src/cache/lazy_disk_cache.rs index 0963522f67..a11e0719e8 100644 --- a/src/cache/lazy_disk_cache.rs +++ b/src/cache/lazy_disk_cache.rs @@ -16,15 +16,27 @@ use std::ffi::OsString; use std::path::Path; pub enum LazyDiskCache { - Uninit { root: OsString, max_size: u64 }, + Uninit { + root: OsString, + max_size: u64, + support_dir_entries: bool, + }, Init(LruDiskCache), } impl LazyDiskCache { pub fn get_or_init(&mut self) -> Result<&mut LruDiskCache> { match self { - LazyDiskCache::Uninit { root, max_size } => { - *self = LazyDiskCache::Init(LruDiskCache::new(&root, *max_size)?); + LazyDiskCache::Uninit { + root, + max_size, + support_dir_entries, + } => { + *self = LazyDiskCache::Init(LruDiskCache::new_with_dir_entries( + &root, + *max_size, + *support_dir_entries, + )?); self.get_or_init() } LazyDiskCache::Init(d) => Ok(d), diff --git a/src/cache/multilevel.rs b/src/cache/multilevel.rs index f9296a4ab1..fdde93364f 100644 --- a/src/cache/multilevel.rs +++ b/src/cache/multilevel.rs @@ -413,6 +413,7 @@ impl MultiLevelStorage { preprocessor_cache_mode_config, rw_mode, config.basedirs.clone(), + false, // file_clone: only honoured for a single-level disk cache )); storages.push(disk_storage); trace!("Added disk cache level"); @@ -566,7 +567,7 @@ impl Storage for MultiLevelStorage { for (idx, level) in self.levels.iter().enumerate() { let start = 
Instant::now(); match level.get(key).await { - Ok(Cache::Hit(entry)) => { + Ok(hit @ (Cache::Hit(_) | Cache::UncompressedHit(_))) => { let duration = start.elapsed(); debug!("Cache hit at level {} in {:?}", idx, duration); @@ -645,7 +646,7 @@ impl Storage for MultiLevelStorage { } } - return Ok(Cache::Hit(entry)); + return Ok(hit); } Ok(Cache::Miss) => { trace!("Cache miss at level {}, trying next level", idx); diff --git a/src/cache/multilevel_test.rs b/src/cache/multilevel_test.rs index bfc383bbcb..c3a3aa2e73 100644 --- a/src/cache/multilevel_test.rs +++ b/src/cache/multilevel_test.rs @@ -59,6 +59,7 @@ fn test_multi_level_storage_get() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, ); let cache2 = DiskCache::new( &cache_dir2, @@ -67,6 +68,7 @@ fn test_multi_level_storage_get() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, ); let cache1_storage: Arc = Arc::new(cache1); @@ -131,6 +133,7 @@ fn test_multi_level_storage_backfill_on_hit() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, ); let cache2 = DiskCache::new( &cache_dir2, @@ -139,6 +142,7 @@ fn test_multi_level_storage_backfill_on_hit() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, ); let cache1_storage: Arc = Arc::new(cache1); @@ -295,6 +299,7 @@ fn test_disk_plus_remote_to_remote_backfill() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, )); let remote_l1 = Arc::new(InMemoryStorage::new()); // Memcached-like @@ -398,6 +403,7 @@ fn test_disk_plus_remotes_write_to_all() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, )); let remote_l1 = Arc::new(InMemoryStorage::new()); @@ -861,6 +867,7 @@ fn test_preprocessor_cache_mode() { preprocessor_config, CacheMode::ReadWrite, vec![], + false, )); let cache_l1 = Arc::new(InMemoryStorage::new()); @@ -912,6 +919,7 @@ fn test_preprocessor_cache_methods() { 
PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, )); let storage = MultiLevelStorage::new(vec![disk_cache as Arc]); @@ -954,6 +962,7 @@ fn test_readonly_level_in_check() { PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, ); // Wrap in ReadOnly @@ -1341,3 +1350,76 @@ fn test_put_mode_all_skips_readonly() { )); }); } + +struct UncompressedHitStorage { + dir: std::path::PathBuf, +} + +#[async_trait] +impl Storage for UncompressedHitStorage { + async fn get(&self, _key: &str) -> Result { + Ok(Cache::UncompressedHit( + crate::cache::UncompressedCacheEntry::new(self.dir.clone()), + )) + } + async fn put(&self, _key: &str, _entry: CacheWrite) -> Result { + Ok(Duration::ZERO) + } + fn location(&self) -> String { + "UncompressedHit".to_string() + } + async fn current_size(&self) -> Result> { + Ok(None) + } + async fn max_size(&self) -> Result> { + Ok(None) + } + // `get_raw` uses the trait default (`Ok(None)`), so no backfill source is available. 
+} + +#[test] +fn test_multilevel_uncompressed_hit_counted_no_backfill() { + let runtime = RuntimeBuilder::new_multi_thread() + .enable_all() + .worker_threads(1) + .build() + .unwrap(); + + let l0 = Arc::new(InMemoryStorage::new()); + let entry_dir = TempBuilder::new().prefix("uhit").tempdir().unwrap(); + fs::create_dir(entry_dir.path().join("objects")).unwrap(); + fs::write( + entry_dir + .path() + .join(crate::lru_disk_cache::DIR_ENTRY_MARKER), + b"", + ) + .unwrap(); + let l1 = Arc::new(UncompressedHitStorage { + dir: entry_dir.path().to_path_buf(), + }); + + let storage = + MultiLevelStorage::new(vec![l0.clone() as Arc, l1 as Arc]); + + runtime.block_on(async { + match storage.get("k").await.unwrap() { + Cache::UncompressedHit(_) => {} + other => panic!("expected UncompressedHit, got {other:?}"), + } + let stats = storage.stats(); + assert_eq!(stats.0[0].hits, 0, "L0 must not be credited with a hit"); + assert_eq!(stats.0[0].misses, 1, "L0 was checked and missed before L1"); + assert_eq!( + stats.0[1].hits, 1, + "L1 uncompressed hit must be counted in the per-level hit stat" + ); + assert_eq!(stats.0[1].misses, 0, "L1 hit, so no miss recorded for it"); + // Allow any (would-be) backfill task to run; none should, since get_raw() is None. + sleep(Duration::from_millis(100)).await; + assert!( + matches!(l0.get("k").await.unwrap(), Cache::Miss), + "L0 must not be backfilled from an uncompressed (get_raw=None) level" + ); + }); +} diff --git a/src/cache/readonly.rs b/src/cache/readonly.rs index 40f9873f5f..3a38be7120 100644 --- a/src/cache/readonly.rs +++ b/src/cache/readonly.rs @@ -37,6 +37,18 @@ impl Storage for ReadOnlyStorage { Err(anyhow!("Cannot write to read-only storage")) } + /// Reject immediately, skipping the default `put_objects`'s compression of a guaranteed failure. 
+ async fn put_objects( + &self, + _key: &str, + _objects: Vec, + _stdout: Vec, + _stderr: Vec, + _pool: &tokio::runtime::Handle, + ) -> Result { + Err(anyhow!("Cannot write to read-only storage")) + } + /// Check the cache capability. /// /// The ReadOnlyStorage cache is always read-only. @@ -163,6 +175,7 @@ mod test { super::PreprocessorCacheModeConfig::default(), super::CacheMode::ReadWrite, basedirs.clone(), + false, ); let readonly_storage = ReadOnlyStorage(std::sync::Arc::new(disk_cache)); @@ -196,6 +209,15 @@ mod test { .to_string(), "Cannot write to read-only storage" ); + let runtime = tokio::runtime::Handle::current(); + assert_eq!( + storage + .put_objects("test1", vec![], Vec::new(), Vec::new(), &runtime) + .await + .unwrap_err() + .to_string(), + "Cannot write to read-only storage" + ); }); } @@ -221,6 +243,7 @@ mod test { super::PreprocessorCacheModeConfig::default(), super::CacheMode::ReadWrite, vec![], + false, ); let readonly_storage = ReadOnlyStorage(std::sync::Arc::new(disk_cache)); diff --git a/src/cache/utils.rs b/src/cache/utils.rs index ee1186fd48..94cf477082 100644 --- a/src/cache/utils.rs +++ b/src/cache/utils.rs @@ -33,6 +33,18 @@ pub(in crate::cache) fn get_file_mode(_file: &fs::File) -> Result> { Ok(None) } +/// Extract the unix mode bits from already-fetched metadata (no extra `stat`/`open`). 
+#[cfg(unix)] +pub(in crate::cache) fn file_mode_of(meta: &std::fs::Metadata) -> Option { + use std::os::unix::fs::MetadataExt; + Some(meta.mode()) +} + +#[cfg(windows)] +pub(in crate::cache) fn file_mode_of(_meta: &std::fs::Metadata) -> Option { + None +} + #[cfg(unix)] pub(in crate::cache) fn set_file_mode(path: &Path, mode: u32) -> Result<()> { use std::fs::Permissions; diff --git a/src/compiler/compiler.rs b/src/compiler/compiler.rs index 6a069dbc8e..74f4007759 100644 --- a/src/compiler/compiler.rs +++ b/src/compiler/compiler.rs @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::cache::{Cache, CacheWrite, DecompressionFailure, FileObjectSource, Storage}; +use crate::cache::{Cache, DecompressionFailure, ExtractionStats, FileObjectSource, Storage}; use crate::compiler::args::*; use crate::compiler::c::{CCompiler, CCompilerKind}; use crate::compiler::cicc::Cicc; @@ -601,6 +601,23 @@ where }) .collect::>(); + // In locally-preprocessed mode the dependency file ("d") was already produced + // locally, so don't overwrite it from the cache. + fn filtered_restore_outputs( + locally_preprocessed: bool, + outputs: &[FileObjectSource], + ) -> Vec { + if locally_preprocessed { + outputs + .iter() + .filter(|fobj_source| fobj_source.key != "d") + .cloned() + .collect() + } else { + outputs.to_vec() + } + } + let lookup = match cache_status.await { (Ok(Ok(Cache::Hit(mut entry))), duration) => { debug!( @@ -614,30 +631,14 @@ where stderr: entry.get_stderr(), }; - let filtered_outputs = if compilation.is_locally_preprocessed() { - // In this mode, cache entries are exclusively distinguished by their preprocessed - // source contents. But two files may differ in their names and / or the names of - // included files while still producing the same preprocessed output, so they get the - // same cache entry. 
That entry will have wrong (file names) dependency informaton in - // the dependency file except for the compilation unit that originally produced it. - // Since we did local preprocessing, that should already have produced the dependency - // file - just leave that one alone and don't overwrite it from the cache. - outputs - .iter() - .filter(|fobj_source| fobj_source.key != "d") // key "d" means dependency file - .cloned() - .collect() - } else { - // In this mode, no local preprocessing was done, so the dependency file (if any) - // has not been created. But in this mode, the cache key also includes a lot of - // information about filenames (and less relevant here, file hashes), so it *is* safe - // to restore the dependency file from the cache. - outputs.clone() - }; + let filtered_outputs = + filtered_restore_outputs(compilation.is_locally_preprocessed(), &outputs); - let hit = CompileResult::CacheHit(duration); match entry.extract_objects(filtered_outputs, &pool).await { - Ok(()) => Ok(CacheLookupResult::Success(hit, output)), + Ok(stats) => Ok(CacheLookupResult::Success( + CompileResult::CacheHit(duration, stats), + output, + )), Err(e) => { if e.downcast_ref::().is_some() { debug!("[{}]: Failed to decompress object", out_pretty); @@ -648,6 +649,35 @@ where } } } + (Ok(Ok(Cache::UncompressedHit(entry))), duration) => { + debug!( + "[{}]: Cache hit (uncompressed) in {}", + out_pretty, + fmt_duration_as_secs(&duration) + ); + let output = process::Output { + status: exit_status(0), + stdout: entry.get_stdout(), + stderr: entry.get_stderr(), + }; + + let filtered_outputs = + filtered_restore_outputs(compilation.is_locally_preprocessed(), &outputs); + + match entry.extract_objects(filtered_outputs, &pool).await { + Ok(stats) => Ok(CacheLookupResult::Success( + CompileResult::CacheHit(duration, stats), + output, + )), + Err(e) => { + debug!( + "[{}]: Failed to restore uncompressed object: {:?}", + out_pretty, e + ); + 
Ok(CacheLookupResult::Miss(MissType::CacheReadError)) + } + } + } (Ok(Ok(Cache::Miss)), duration) => { debug!( "[{}]: Cache miss in {}", @@ -773,30 +803,30 @@ where out_pretty, fmt_duration_as_secs(&duration_compilation) ); - let start_create_artifact = Instant::now(); - let mut entry = CacheWrite::from_objects(outputs, &pool) - .await - .context("failed to zip up compiler outputs")?; - - entry.put_stdout(&compiler_result.stdout)?; - entry.put_stderr(&compiler_result.stderr)?; - debug!( - "[{}]: Created cache artifact in {}", - out_pretty, - fmt_duration_as_secs(&start_create_artifact.elapsed()) - ); + // The compiler outputs still exist on disk; hand them to the backend directly so the + // disk cache can reflink them when `file_clone` is enabled (other backends zip+zstd). + let stdout = compiler_result.stdout.clone(); + let stderr = compiler_result.stderr.clone(); let out_pretty2 = out_pretty.clone(); // Try to finish storing the newly-written cache // entry. We'll get the result back elsewhere. let future = async move { let start = Instant::now(); - match storage.put(&key, entry).await { + match storage + .put_objects(&key, outputs, stdout, stderr, &pool) + .await + { Ok(_) => { - debug!("[{}]: Stored in cache successfully!", out_pretty2); + let elapsed = start.elapsed(); + debug!( + "[{}]: Stored in cache successfully in {}", + out_pretty2, + fmt_duration_as_secs(&elapsed) + ); Ok(CacheWriteInfo { object_file_pretty: out_pretty2, - duration: start.elapsed(), + duration: elapsed, }) } Err(e) => Err(e), @@ -1220,8 +1250,8 @@ pub struct CacheWriteInfo { pub enum CompileResult { /// An error made the compilation not possible. Error, - /// Result was found in cache. - CacheHit(Duration), + /// Result was found in cache, with counts of how objects were restored (reflinked vs copied). + CacheHit(Duration, ExtractionStats), /// Result was not found in cache. 
/// /// The `CacheWriteFuture` will resolve when the result is finished @@ -1254,7 +1284,7 @@ impl fmt::Debug for CompileResult { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { CompileResult::Error => write!(f, "CompileResult::Error"), - CompileResult::CacheHit(ref d) => write!(f, "CompileResult::CacheHit({:?})", d), + CompileResult::CacheHit(ref d, _) => write!(f, "CompileResult::CacheHit({:?})", d), CompileResult::CacheMiss(ref m, ref dt, ref d, _) => { write!(f, "CompileResult::CacheMiss({:?}, {:?}, {:?}, _)", d, m, dt) } @@ -1276,7 +1306,7 @@ impl PartialEq for CompileResult { fn eq(&self, other: &CompileResult) -> bool { match (self, other) { (&CompileResult::Error, &CompileResult::Error) => true, - (&CompileResult::CacheHit(_), &CompileResult::CacheHit(_)) => true, + (&CompileResult::CacheHit(..), &CompileResult::CacheHit(..)) => true, (CompileResult::CacheMiss(m, dt, _, _), CompileResult::CacheMiss(n, dt2, _, _)) => { m == n && dt == dt2 } @@ -1896,7 +1926,7 @@ where mod test { use super::*; use crate::cache::disk::DiskCache; - use crate::cache::{CacheMode, CacheRead}; + use crate::cache::{CacheMode, CacheRead, CacheWrite}; use crate::config::PreprocessorCacheModeConfig; use crate::mock_command::*; use crate::test::mock_storage::MockStorage; @@ -2556,6 +2586,7 @@ LLVM version: 6.0", }, CacheMode::ReadWrite, vec![], + false, ); // Write a dummy input file so the preprocessor cache mode can work std::fs::write(f.tempdir.path().join("foo.c"), "whatever").unwrap(); @@ -2661,7 +2692,10 @@ LLVM version: 6.0", .unwrap(); // Ensure that the object file was created. 
assert!(fs::metadata(&obj).map(|m| m.len() > 0).unwrap()); - assert_eq!(CompileResult::CacheHit(Duration::new(0, 0)), cached); + assert_eq!( + CompileResult::CacheHit(Duration::new(0, 0), ExtractionStats::default()), + cached + ); assert_eq!(exit_status(0), res.status); assert_eq!(COMPILER_STDOUT, res.stdout.as_slice()); assert_eq!(COMPILER_STDERR, res.stderr.as_slice()); @@ -2687,6 +2721,7 @@ LLVM version: 6.0", }, CacheMode::ReadWrite, vec![], + false, ); // Write a dummy input file so the preprocessor cache mode can work std::fs::write(f.tempdir.path().join("foo.c"), "whatever").unwrap(); @@ -2791,7 +2826,10 @@ LLVM version: 6.0", .unwrap(); // Ensure that the object file was created. assert!(fs::metadata(&obj).map(|m| m.len() > 0).unwrap()); - assert_eq!(CompileResult::CacheHit(Duration::new(0, 0)), cached); + assert_eq!( + CompileResult::CacheHit(Duration::new(0, 0), ExtractionStats::default()), + cached + ); assert_eq!(exit_status(0), res.status); assert_eq!(COMPILER_STDOUT, res.stdout.as_slice()); assert_eq!(COMPILER_STDERR, res.stderr.as_slice()); @@ -2965,13 +3003,97 @@ LLVM version: 6.0", )) .unwrap(); match cached { - CompileResult::CacheHit(duration) => { + CompileResult::CacheHit(duration, _) => { assert!(duration >= storage_delay); } _ => panic!("Unexpected compile result: {:?}", cached), } } + #[test_case(true ; "with preprocessor cache")] + #[test_case(false ; "without preprocessor cache")] + fn test_compiler_get_cached_or_compile_uncompressed_hit(preprocessor_cache_mode: bool) { + drop(env_logger::try_init()); + let creator = new_creator(); + let f = TestFixture::new(); + let gcc = f.mk_bin("gcc").unwrap(); + let runtime = Runtime::new().unwrap(); + let pool = runtime.handle().clone(); + std::fs::write(f.tempdir.path().join("foo.c"), "whatever").unwrap(); + let storage = Arc::new(MockStorage::new(None, preprocessor_cache_mode)); + let service = server::SccacheService::mock_with_storage(storage.clone(), pool.clone()); + next_command( + &creator, + 
Ok(MockChild::new(exit_status(0), "compiler_id=gcc", "")), + ); + let c = get_compiler_info( + creator.clone(), + &gcc, + f.tempdir.path(), + &[], + &[], + &pool, + None, + ) + .wait() + .unwrap() + .0; + next_command( + &creator, + Ok(MockChild::new(exit_status(0), "preprocessor output", "")), + ); + + const COMPILER_STDOUT: &[u8] = b"uncompressed stdout"; + const COMPILER_STDERR: &[u8] = b"uncompressed stderr"; + let obj_contents: &[u8] = &[9, 8, 7, 6, 5]; + + let entry_dir = f.tempdir.path().join("uentry"); + std::fs::create_dir_all(entry_dir.join(crate::cache::cache_io::OBJECTS_SUBDIR)).unwrap(); + std::fs::write( + entry_dir + .join(crate::cache::cache_io::OBJECTS_SUBDIR) + .join("obj"), + obj_contents, + ) + .unwrap(); + std::fs::write(entry_dir.join("stdout"), COMPILER_STDOUT).unwrap(); + std::fs::write(entry_dir.join("stderr"), COMPILER_STDERR).unwrap(); + std::fs::write(entry_dir.join(crate::lru_disk_cache::DIR_ENTRY_MARKER), b"").unwrap(); + let entry = crate::cache::UncompressedCacheEntry::new(entry_dir); + + let cwd = f.tempdir.path(); + let obj = cwd.join("foo.o"); + let arguments = ovec!["-c", "foo.c", "-o", "foo.o"]; + let mut hasher = match c.parse_arguments(&arguments, ".".as_ref(), &[]) { + CompilerArguments::Ok(h) => h, + o => panic!("Bad result from parse_arguments: {:?}", o), + }; + storage.next_get(Ok(Cache::UncompressedHit(entry))); + let (cached, res) = runtime + .block_on(hasher.get_cached_or_compile( + &service, + None, + creator, + storage, + arguments, + cwd.to_path_buf(), + vec![], + CacheControl::Default, + pool, + )) + .unwrap(); + match cached { + CompileResult::CacheHit(_, stats) => { + assert_eq!(stats.objects_reflinked + stats.objects_copied, 1); + } + _ => panic!("Unexpected compile result: {:?}", cached), + } + assert_eq!(exit_status(0), res.status); + assert_eq!(COMPILER_STDOUT, res.stdout.as_slice()); + assert_eq!(COMPILER_STDERR, res.stderr.as_slice()); + assert_eq!(fs::read(&obj).unwrap(), obj_contents); + } + 
#[test_case(true ; "with preprocessor cache")] #[test_case(false ; "without preprocessor cache")] fn test_compiler_get_cached_or_compile_force_recache(preprocessor_cache_mode: bool) { @@ -2991,6 +3113,7 @@ LLVM version: 6.0", }, CacheMode::ReadWrite, vec![], + false, ); let storage = Arc::new(storage); let service = server::SccacheService::mock_with_storage(storage.clone(), pool.clone()); @@ -3121,6 +3244,7 @@ LLVM version: 6.0", }, CacheMode::ReadWrite, vec![], + false, ); let storage = Arc::new(storage); let service = server::SccacheService::mock_with_storage(storage.clone(), pool.clone()); @@ -3220,6 +3344,7 @@ LLVM version: 6.0", }, CacheMode::ReadWrite, vec![], + false, ); let storage = Arc::new(storage); // Pretend to be GCC. diff --git a/src/config.rs b/src/config.rs index ecbe98b8d8..276232fe3e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -298,6 +298,9 @@ pub struct DiskCacheConfig { pub size: u64, pub preprocessor_cache_mode: PreprocessorCacheModeConfig, pub rw_mode: CacheModeConfig, + /// Store cache entries uncompressed and restore them via filesystem reflinks (copy-on-write) on + /// CoW filesystems, falling back to copies elsewhere. Defaults to `false`. 
+ pub file_clone: bool, } impl Default for DiskCacheConfig { @@ -307,6 +310,7 @@ impl Default for DiskCacheConfig { size: default_disk_cache_size(), preprocessor_cache_mode: PreprocessorCacheModeConfig::activated(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, } } } @@ -1119,16 +1123,20 @@ fn config_from_env() -> Result { _ => (CacheModeConfig::ReadWrite, false), }; + let file_clone_overridden = bool_from_env_var("SCCACHE_FILE_CLONE")?; + let any_overridden = disk_dir.is_some() || disk_sz.is_some() || preprocessor_mode_overridden - || disk_rw_mode_overridden; + || disk_rw_mode_overridden + || file_clone_overridden.is_some(); let disk = if any_overridden { Some(DiskCacheConfig { dir: disk_dir.unwrap_or_else(default_disk_cache_dir), size: disk_sz.unwrap_or_else(default_disk_cache_size), preprocessor_cache_mode: preprocessor_mode_config, rw_mode: disk_rw_mode, + file_clone: file_clone_overridden.unwrap_or(false), }) } else { None @@ -1569,6 +1577,7 @@ fn config_overrides() { size: 5, preprocessor_cache_mode: Default::default(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, }), redis: Some(RedisCacheConfig { endpoint: Some("myotherredisurl".to_owned()), @@ -1591,6 +1600,7 @@ fn config_overrides() { size: 15, preprocessor_cache_mode: Default::default(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, }), memcached: Some(MemcachedCacheConfig { url: "memurl".to_owned(), @@ -1634,6 +1644,7 @@ fn config_overrides() { size: 5, preprocessor_cache_mode: Default::default(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, }), memcached: Some(MemcachedCacheConfig { url: "memurl".to_owned(), @@ -1657,6 +1668,7 @@ fn config_overrides() { size: 5, preprocessor_cache_mode: Default::default(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, }, dist: Default::default(), server_startup_timeout: None, @@ -2207,6 +2219,72 @@ fn test_gcs_service_account() { } } +#[test] +#[serial(config_from_env)] +fn test_file_clone_from_env() { + 
temp_env::with_vars( + vec![ + ("SCCACHE_FILE_CLONE", Some("true")), + ("SCCACHE_DIRECT", None::<&str>), + ], + || { + let disk = config_from_env() + .unwrap() + .cache + .disk + .expect("file_clone override should produce a disk config"); + assert!(disk.file_clone); + assert_eq!( + disk.preprocessor_cache_mode, + PreprocessorCacheModeConfig::activated() + ); + }, + ); +} + +#[test] +#[serial(config_from_env)] +fn test_file_clone_from_env_disabled() { + temp_env::with_vars( + vec![ + ("SCCACHE_FILE_CLONE", Some("false")), + ("SCCACHE_DIRECT", None::<&str>), + ], + || { + let disk = config_from_env() + .unwrap() + .cache + .disk + .expect("file_clone override should produce a disk config"); + assert!(!disk.file_clone); + }, + ); +} + +#[test] +fn test_file_clone_toml_roundtrip() { + const CONFIG_STR: &str = r#" +[cache.disk] +dir = "/tmp/file_clone_cache" +size = 1024 +file_clone = true +"#; + let file_config: FileConfig = toml::from_str(CONFIG_STR).expect("Is valid toml."); + let disk = file_config.cache.disk.expect("disk config present"); + assert!(disk.file_clone); + assert_eq!( + disk.preprocessor_cache_mode, + PreprocessorCacheModeConfig::activated() + ); + assert_eq!(disk.rw_mode, CacheModeConfig::ReadWrite); + + let serialized = toml::to_string(&disk).unwrap(); + assert!(serialized.contains("file_clone = true")); + + let without: DiskCacheConfig = toml::from_str("dir = \"/tmp/x\"\nsize = 1024\n").unwrap(); + assert!(!without.file_clone); +} + #[test] fn full_toml_parse() { const CONFIG_STR: &str = r#" @@ -2304,6 +2382,7 @@ key_prefix = "cosprefix" size: 7 * 1024 * 1024 * 1024, preprocessor_cache_mode: PreprocessorCacheModeConfig::activated(), rw_mode: CacheModeConfig::ReadWrite, + file_clone: false, }), gcs: Some(GCSCacheConfig { bucket: "bucket".to_owned(), @@ -2446,7 +2525,7 @@ fn server_toml_parse() { }, toolchain_cache_size: 10737418240, } - ) + ); } #[test] diff --git a/src/lib.rs b/src/lib.rs index d5aa6a6032..1ad6eb8d2e 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -45,6 +45,7 @@ pub mod lru_disk_cache; mod mock_command; mod net; mod protocol; +mod reflink; pub mod server; #[doc(hidden)] pub mod util; diff --git a/src/lru_disk_cache/mod.rs b/src/lru_disk_cache/mod.rs index 65f7447616..2f0fc3e549 100644 --- a/src/lru_disk_cache/mod.rs +++ b/src/lru_disk_cache/mod.rs @@ -4,6 +4,7 @@ use fs::File; use fs_err as fs; use std::borrow::Borrow; use std::boxed::Box; +use std::collections::HashSet; use std::collections::hash_map::RandomState; use std::error::Error as StdError; use std::ffi::{OsStr, OsString}; @@ -12,6 +13,7 @@ use std::hash::BuildHasher; use std::io; use std::io::prelude::*; use std::path::{Path, PathBuf}; +use std::time::SystemTime; use filetime::{FileTime, set_file_times}; pub use lru_cache::{LruCache, Meter}; @@ -20,7 +22,29 @@ use walkdir::WalkDir; use crate::util::OsStrExt; -const TEMPFILE_PREFIX: &str = ".sccachetmp"; +pub(crate) const TEMPFILE_PREFIX: &str = ".sccachetmp"; + +/// Marker file identifying a directory at the cache key depth as a finished `file_clone` entry. +pub const DIR_ENTRY_MARKER: &str = ".sccache_dir_entry"; + +fn dir_content_size(path: &Path) -> u64 { + WalkDir::new(path) + .into_iter() + .filter_map(std::result::Result::ok) + .filter(|e| e.file_type().is_file()) + .filter_map(|e| e.metadata().ok()) + .map(|m| m.len()) + .sum() +} + +/// Whether a removal error means the entry is already gone (removed out-of-band, or `ENOTDIR` from +/// a stale inner-file record after `file_clone` was toggled off) and is safe to ignore. +fn is_entry_already_gone(e: &io::Error) -> bool { + matches!( + e.kind(), + io::ErrorKind::NotFound | io::ErrorKind::NotADirectory + ) +} struct FileSize; @@ -61,12 +85,21 @@ fn get_all_files>(path: P) -> Box { lru: LruCache, root: PathBuf, pending: Vec, pending_size: u64, + support_dir_entries: bool, + dir_entries: HashSet, } /// Errors returned by this crate. 
@@ -141,6 +174,15 @@ impl LruDiskCache { /// The cache is not observant of changes to files under `path` from external sources, it /// expects to have sole maintence of the contents. pub fn new(path: T, size: u64) -> Result + where + PathBuf: From, + { + Self::new_with_dir_entries(path, size, false) + } + + /// Like [`LruDiskCache::new`], but `support_dir_entries` enables recognition, sizing and + /// eviction of uncompressed directory cache entries (used by the `file_clone` disk cache). + pub fn new_with_dir_entries(path: T, size: u64, support_dir_entries: bool) -> Result where PathBuf: From, { @@ -149,6 +191,8 @@ impl LruDiskCache { root: PathBuf::from(path), pending: vec![], pending_size: 0, + support_dir_entries, + dir_entries: HashSet::new(), } .init() } @@ -182,9 +226,16 @@ impl LruDiskCache { self.root.join(rel_path) } - /// Scan `self.root` for existing files and store them. - fn init(mut self) -> Result { + fn init(self) -> Result { fs::create_dir_all(&self.root)?; + if self.support_dir_entries { + self.init_with_dir_entries() + } else { + self.init_files_only() + } + } + + fn init_files_only(mut self) -> Result { for (file, size) in get_all_files(&self.root) { if file .file_name() @@ -209,6 +260,165 @@ impl LruDiskCache { Ok(self) } + fn init_with_dir_entries(mut self) -> Result { + self.remove_temp_entries(); + + let (mut entries, orphans) = self.scan_entries(); + + for orphan in orphans { + warn!( + "Removing orphan cache directory without marker: {}", + orphan.display() + ); + fs::remove_dir_all(&orphan).unwrap_or_else(|e| { + error!( + "Error removing orphan directory `{}`: {}", + orphan.display(), + e + ); + }); + } + + entries.sort_by_key(|e| e.mtime); + for ScannedEntry { + path, size, is_dir, .. 
+ } in entries + { + if !self.can_store(size) { + let res = if is_dir { + fs::remove_dir_all(&path) + } else { + fs::remove_file(&path) + }; + res.unwrap_or_else(|e| { + error!( + "Error removing entry `{}` which is too large for the cache ({} bytes): {}", + path.display(), + size, + e + ); + }); + } else { + let rel = path + .strip_prefix(&self.root) + .expect("Bad path?") + .as_os_str() + .to_owned(); + match self.add_file(AddFile::RelPath(rel.as_os_str()), size) { + Ok(()) => { + if is_dir { + self.dir_entries.insert(rel); + } + } + Err(e) => error!("Error adding entry: {}", e), + } + } + } + Ok(self) + } + + fn remove_temp_entries(&self) { + let read_dir = match fs::read_dir(&self.root) { + Ok(rd) => rd, + Err(e) => { + error!( + "Error reading cache directory `{}`: {}", + self.root.display(), + e + ); + return; + } + }; + for entry in read_dir.filter_map(std::result::Result::ok) { + if !entry.file_name().starts_with(TEMPFILE_PREFIX) { + continue; + } + let path = entry.path(); + let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false); + let res = if is_dir { + fs::remove_dir_all(&path) + } else { + fs::remove_file(&path) + }; + res.unwrap_or_else(|e| { + error!("Error removing temporary entry `{}`: {}", path.display(), e); + }); + } + } + + fn scan_entries(&self) -> (Vec, Vec) { + let preprocessor_dir = self.root.join("preprocessor"); + let root_depth = self.root.components().count(); + let mut entries: Vec = Vec::new(); + let mut orphans: Vec = Vec::new(); + + let mut walker = WalkDir::new(&self.root).min_depth(1).into_iter(); + loop { + let entry = match walker.next() { + None => break, + Some(Ok(e)) => e, + Some(Err(_)) => continue, + }; + let path = entry.path(); + let is_dir = entry.file_type().is_dir(); + + // Prune the preprocessor subtree (owned by the sibling cache) and any temp entries. 
+ if (is_dir && path == preprocessor_dir) + || entry.file_name().starts_with(TEMPFILE_PREFIX) + { + if is_dir { + walker.skip_current_dir(); + } + continue; + } + + if is_dir { + let depth = path.components().count() - root_depth; + if depth < 3 { + continue; + } + if depth == 3 { + if path.join(DIR_ENTRY_MARKER).exists() { + if let Ok(meta) = entry.metadata() { + if let Ok(mtime) = meta.modified() { + let size = dir_content_size(path); + entries.push(ScannedEntry { + mtime, + path: path.to_owned(), + size, + is_dir: true, + }); + } + } + } else { + let has_direct_files = fs::read_dir(path) + .map(|rd| { + rd.filter_map(std::result::Result::ok) + .any(|e| e.file_type().map(|t| t.is_file()).unwrap_or(false)) + }) + .unwrap_or(false); + if has_direct_files { + orphans.push(path.to_owned()); + } + } + } + walker.skip_current_dir(); + } else if entry.file_type().is_file() { + if let Ok(meta) = entry.metadata() { + if let Ok(mtime) = meta.modified() { + entries.push(ScannedEntry { + mtime, + path: path.to_owned(), + size: meta.len(), + is_dir: false, + }); + } + } + } + } + (entries, orphans) + } + /// Returns `true` if the disk cache can store a file of `size` bytes. pub fn can_store(&self, size: u64) -> bool { size <= self.lru.capacity() @@ -221,21 +431,27 @@ impl LruDiskCache { //TODO: ideally LRUCache::insert would give us back the entries it had to remove. while self.size() + size > self.capacity() { let (rel_path, _) = self.lru.remove_lru().expect("Unexpectedly empty cache!"); - let remove_path = self.rel_to_abs_path(rel_path); + let remove_path = self.rel_to_abs_path(&rel_path); + let is_dir = self.dir_entries.remove(&rel_path); //TODO: check that files are removable during `init`, so that this is only // due to outside interference. 
- fs::remove_file(&remove_path).unwrap_or_else(|e| { - // Sometimes the file has already been removed + let res = if is_dir { + fs::remove_dir_all(&remove_path) + } else { + fs::remove_file(&remove_path) + }; + res.unwrap_or_else(|e| { + // Sometimes the entry has already been removed // this seems to happen when the max cache size has been reached // https://github.com/mozilla/sccache/issues/2092 - if e.kind() == std::io::ErrorKind::NotFound { + if is_entry_already_gone(&e) { debug!( - "Error removing file from cache as it was not found: `{:?}`", - remove_path + "Error removing entry from cache as it is already gone: `{:?}`: {}", + remove_path, e ); } else { panic!( - "Error removing file from cache: `{:?}`: {}, {:?}", + "Error removing entry from cache: `{:?}`: {}, {:?}", remove_path, e, e.kind() @@ -361,11 +577,42 @@ impl LruDiskCache { self.pending_size -= size; let path = self.rel_to_abs_path(&key); fs::create_dir_all(path.parent().unwrap())?; - file.persist(path).map_err(|e| e.error)?; + match file.persist(&path) { + Ok(_) => {} + Err(persist_err) => { + if path.is_dir() { + self.remove_entry_and_strands(&key); + fs::remove_dir_all(&path)?; + persist_err.file.persist(&path).map_err(|e| e.error)?; + } else { + return Err(persist_err.error.into()); + } + } + } self.lru.insert(key, real_size); Ok(()) } + fn remove_entry_and_strands(&mut self, key: &OsStr) { + self.dir_entries.remove(key); + self.lru.remove(key); + let key_path = Path::new(key); + let strands: Vec = self + .lru + .iter() + .map(|(k, _)| k) + .filter(|k| { + let kp = Path::new(k.as_os_str()); + kp != key_path && kp.starts_with(key_path) + }) + .cloned() + .collect(); + for strand in strands { + self.dir_entries.remove(&strand); + self.lru.remove(&strand); + } + } + /// Return `true` if a file with path `key` is in the cache. Entries created /// by `LruDiskCache::prepare_add` but not yet committed return `false`. 
pub fn contains_key>(&self, key: K) -> bool { @@ -402,14 +649,86 @@ impl LruDiskCache { match self.lru.remove(key.as_ref()) { Some(_) => { let path = self.rel_to_abs_path(key.as_ref()); - fs::remove_file(&path).map_err(|e| { - error!("Error removing file from cache: `{:?}`: {}", path, e); - Into::into(e) - }) + let res = if self.dir_entries.remove(key.as_ref()) { + fs::remove_dir_all(&path) + } else { + fs::remove_file(&path) + }; + match res { + Ok(()) => Ok(()), + Err(e) if is_entry_already_gone(&e) => { + debug!("Entry `{:?}` was already gone on remove: {}", path, e); + Ok(()) + } + Err(e) => { + error!("Error removing entry from cache: `{:?}`: {}", path, e); + Err(e.into()) + } + } } None => Ok(()), } } + + /// Return `true` if `key` is registered as a directory (uncompressed) cache entry. + pub fn contains_dir_key>(&self, key: K) -> bool { + self.dir_entries.contains(key.as_ref()) + } + + /// Update the LRU recency of an entry without opening it. `Ok(true)` if it was present. + pub fn touch>(&mut self, key: K) -> Result { + let rel_path = key.as_ref(); + if self.lru.get(rel_path).is_some() { + let path = self.rel_to_abs_path(rel_path); + let t = FileTime::now(); + set_file_times(&path, t, t).unwrap_or_else(|e| { + debug!("Failed to update mtime for {:?}: {}", path, e); + }); + Ok(true) + } else { + Ok(false) + } + } + + /// Atomically install the fully-populated `staging_dir` (which must already contain the entry's + /// files and the marker) as the directory cache entry for `key`, replacing any existing entry. 
+ pub fn insert_dir>(&mut self, key: K, staging_dir: &Path) -> Result<()> { + let rel_path = key.as_ref().to_owned(); + let size = dir_content_size(staging_dir); + if !self.can_store(size) { + return Err(Error::FileTooLarge); + } + self.remove_any_entry(&rel_path); + self.make_space(size)?; + let final_path = self.rel_to_abs_path(&rel_path); + let parent = final_path.parent().expect("Bad path?"); + fs::create_dir_all(parent)?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = fs::set_permissions(parent, std::fs::Permissions::from_mode(0o700)); + } + fs::rename(staging_dir, &final_path)?; + self.dir_entries.insert(rel_path.clone()); + self.lru.insert(rel_path, size); + Ok(()) + } + + fn remove_any_entry(&mut self, rel_path: &OsStr) { + let was_dir = self.dir_entries.remove(rel_path); + self.lru.remove(rel_path); + let path = self.rel_to_abs_path(rel_path); + let res = if was_dir || path.is_dir() { + fs::remove_dir_all(&path) + } else { + fs::remove_file(&path) + }; + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + error!("Error removing existing entry `{}`: {}", path.display(), e); + } + } + } } #[cfg(test)] @@ -729,4 +1048,340 @@ mod tests { assert!(!f.tmp().join("cache").join("file2").exists()); assert!(!p4.exists()); } + + fn make_staging_dir(root: &Path, name: &str, files: &[(&str, usize)]) -> PathBuf { + let staging = root.join(format!("{}{}", super::TEMPFILE_PREFIX, name)); + fs::create_dir_all(&staging).unwrap(); + for (fname, size) in files { + fs::write(staging.join(fname), vec![0u8; *size]).unwrap(); + } + fs::write(staging.join(super::DIR_ENTRY_MARKER), b"").unwrap(); + staging + } + + #[test] + fn test_insert_dir_entry_size_and_reinit() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let key = Path::new("a").join("b").join("abcdef"); + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + let staging = make_staging_dir(&cache_dir, "s1", &[("obj", 20), 
("d", 5)]); + c.insert_dir(&key, &staging).unwrap(); + assert!(c.contains_key(&key)); + assert!(c.contains_dir_key(&key)); + assert_eq!(c.size(), 25); + assert!(cache_dir.join(&key).is_dir()); + assert!(cache_dir.join(&key).join(super::DIR_ENTRY_MARKER).exists()); + assert_eq!( + fs::read(cache_dir.join(&key).join("obj")).unwrap().len(), + 20 + ); + assert!(!staging.exists()); + } + let c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + assert!(c.contains_key(&key)); + assert!(c.contains_dir_key(&key)); + assert_eq!(c.size(), 25); + } + + #[test] + fn test_dir_entry_eviction_uses_remove_dir_all() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 60, true).unwrap(); + let k1 = Path::new("a").join("a").join("k1"); + let k2 = Path::new("b").join("b").join("k2"); + let k3 = Path::new("c").join("c").join("k3"); + c.insert_dir(&k1, &make_staging_dir(&cache_dir, "s1", &[("obj", 30)])) + .unwrap(); + c.insert_dir(&k2, &make_staging_dir(&cache_dir, "s2", &[("obj", 30)])) + .unwrap(); + assert_eq!(c.size(), 60); + c.insert_dir(&k3, &make_staging_dir(&cache_dir, "s3", &[("obj", 30)])) + .unwrap(); + assert_eq!(c.size(), 60); + assert!(!c.contains_key(&k1)); + assert!( + !cache_dir.join(&k1).exists(), + "evicted directory entry must be fully removed" + ); + assert!(c.contains_key(&k2)); + assert!(c.contains_key(&k3)); + } + + #[test] + fn test_preprocessor_subtree_untouched_on_dir_entry_init() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let compressed = Path::new("a").join("b").join("compkey"); + let preproc = Path::new("preprocessor") + .join("c") + .join("d") + .join("e") + .join("ppkey"); + fs::create_dir_all(cache_dir.join(&compressed).parent().unwrap()).unwrap(); + fs::write(cache_dir.join(&compressed), vec![0u8; 10]).unwrap(); + fs::create_dir_all(cache_dir.join(&preproc).parent().unwrap()).unwrap(); + 
fs::write(cache_dir.join(&preproc), vec![0u8; 10]).unwrap(); + + let c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + assert!(c.contains_key(&compressed), "compressed entry is tracked"); + assert!( + cache_dir.join(&preproc).exists(), + "preprocessor file must NOT be deleted" + ); + assert!( + !c.contains_key(&preproc), + "preprocessor subtree is pruned from the object cache, not tracked" + ); + } + + #[test] + fn test_orphan_dir_cleanup_on_init() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let comp = Path::new("a").join("b").join("validkey"); + let good = Path::new("e").join("f").join("goodkey"); + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_bytes(&comp, &[0u8; 10]).unwrap(); + c.insert_dir(&good, &make_staging_dir(&cache_dir, "good", &[("obj", 10)])) + .unwrap(); + } + let orphan = cache_dir.join("c").join("d").join("orphankey"); + fs::create_dir_all(&orphan).unwrap(); + fs::write(orphan.join("obj"), vec![0u8; 10]).unwrap(); + + let c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + assert!(c.contains_key(&comp), "compressed entry survives reinit"); + assert!( + c.contains_key(&good), + "marker directory entry survives reinit" + ); + assert!(c.contains_dir_key(&good)); + assert!( + !orphan.exists(), + "marker-less orphan directory at key depth is removed" + ); + assert!(cache_dir.join(&comp).exists()); + } + + #[test] + fn test_compressed_entries_survive_dir_entry_reinit() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let key1 = Path::new("a").join("b").join("abcdef1234"); + let key2 = Path::new("a").join("b").join("abcdef5678"); + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_bytes(&key1, &[1; 10]).unwrap(); + c.insert_bytes(&key2, &[2; 10]).unwrap(); + assert_eq!(c.len(), 2); + } + let c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, 
true).unwrap(); + assert!(c.contains_key(&key1)); + assert!(c.contains_key(&key2)); + assert_eq!(c.len(), 2); + assert_eq!(c.size(), 20); + } + + #[test] + fn test_remove_directory_entry() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + let key = Path::new("a").join("b").join("dirkey"); + c.insert_dir(&key, &make_staging_dir(&cache_dir, "rm", &[("obj", 20)])) + .unwrap(); + assert!(c.contains_key(&key) && c.contains_dir_key(&key)); + assert_eq!(c.size(), 20); + + c.remove(&key).unwrap(); + assert!(!c.contains_key(&key)); + assert!(!c.contains_dir_key(&key)); + assert!(!cache_dir.join(&key).exists()); + assert_eq!(c.size(), 0); + } + + #[test] + fn test_remove_temp_entries_on_init() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + fs::create_dir_all(&cache_dir).unwrap(); + let prefix = super::TEMPFILE_PREFIX; + let tmp_file = cache_dir.join(format!("{prefix}leftover")); + fs::write(&tmp_file, b"junk").unwrap(); + let tmp_dir = cache_dir.join(format!("{prefix}dir")); + fs::create_dir_all(&tmp_dir).unwrap(); + fs::write(tmp_dir.join("inner"), b"junk").unwrap(); + let real = Path::new("a").join("b").join("realkey"); + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_bytes(&real, &[3; 10]).unwrap(); + // Re-plant temp leftovers after init (insert_bytes doesn't remove them). 
+ fs::write(&tmp_file, b"junk").unwrap(); + fs::create_dir_all(&tmp_dir).unwrap(); + fs::write(tmp_dir.join("inner"), b"junk").unwrap(); + } + let c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + assert!(!tmp_file.exists(), "leftover temp file removed on init"); + assert!(!tmp_dir.exists(), "leftover temp dir removed on init"); + assert!(c.contains_key(&real)); + } + + #[test] + fn test_dir_entry_small_branches() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 100, true).unwrap(); + + assert!(!c.touch(Path::new("a").join("b").join("nope")).unwrap()); + + let big = make_staging_dir(&cache_dir, "big", &[("obj", 200)]); + let key = Path::new("a").join("b").join("big"); + assert!(matches!(c.insert_dir(&key, &big), Err(Error::FileTooLarge))); + + let key2 = Path::new("c").join("d").join("dup"); + c.insert_dir(&key2, &make_staging_dir(&cache_dir, "d1", &[("obj", 10)])) + .unwrap(); + c.insert_dir(&key2, &make_staging_dir(&cache_dir, "d2", &[("obj", 20)])) + .unwrap(); + assert!(c.contains_dir_key(&key2)); + assert_eq!(c.size(), 20, "second insert replaced the first"); + assert!(c.touch(&key2).unwrap()); + } + + #[test] + fn test_toggle_file_clone_off_commit_cleans_strands() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let key = Path::new("a").join("b").join("togglekey"); + + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_dir( + &key, + &make_staging_dir(&cache_dir, "t", &[("obj", 30), ("d", 10)]), + ) + .unwrap(); + assert!(c.contains_dir_key(&key)); + } + + // Reopen with file_clone off: init_files_only registers the inner files as strands (`/obj`). 
+ let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, false).unwrap(); + let strand = key.join("obj"); + assert!( + c.contains_key(&strand), + "inner-file strand is tracked after toggle-off" + ); + + let mut tmp = c.prepare_add(&key, 10).unwrap(); + tmp.as_file_mut().write_all(&[7u8; 10]).unwrap(); + c.commit(tmp).unwrap(); + + assert!( + cache_dir.join(&key).is_file(), + "key is now a compressed file" + ); + assert!(!c.contains_dir_key(&key)); + assert!(c.contains_key(&key)); + assert!( + !c.contains_key(&strand), + "stranded inner-file record removed" + ); + assert_eq!(read_all(&mut c.get(&key).unwrap()).unwrap(), vec![7u8; 10]); + + c.insert_bytes(Path::new("x").join("y").join("z"), &[0u8; 995]) + .unwrap(); + assert!(c.contains_key(Path::new("x").join("y").join("z"))); + assert!(!c.contains_key(&key), "the old entry was evicted"); + } + + #[test] + fn test_evict_strand_under_file_key_no_panic() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let key = Path::new("a").join("b").join("k"); + + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 100, true).unwrap(); + c.insert_dir(&key, &make_staging_dir(&cache_dir, "s", &[("obj", 80)])) + .unwrap(); + } + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 100, false).unwrap(); + assert!(c.contains_key(key.join("obj"))); + + // Replace the key dir with a file so the strand's parent is now a file → ENOTDIR on eviction. 
+ fs::remove_dir_all(cache_dir.join(&key)).unwrap(); + fs::write(cache_dir.join(&key), [0u8; 5]).unwrap(); + + c.insert_bytes(Path::new("c").join("d").join("e"), &[1u8; 90]) + .unwrap(); + assert!(c.contains_key(Path::new("c").join("d").join("e"))); + } + + #[test] + fn test_remove_strand_under_file_key_is_ok() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + let key = Path::new("a").join("b").join("k"); + + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_dir(&key, &make_staging_dir(&cache_dir, "s", &[("obj", 20)])) + .unwrap(); + } + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, false).unwrap(); + let strand = key.join("obj"); + assert!(c.contains_key(&strand)); + + // Replace the key dir with a file so the strand's parent is now a file → ENOTDIR on remove. + fs::remove_dir_all(cache_dir.join(&key)).unwrap(); + fs::write(cache_dir.join(&key), [0u8; 5]).unwrap(); + + c.remove(&strand).unwrap(); + assert!( + !c.contains_key(&strand), + "strand record dropped after remove" + ); + } + + #[test] + fn test_remove_entry_and_strands_prefix_precision() { + let f = TestFixture::new(); + let cache_dir = f.tmp().join("cache"); + // Both keys live under `a/b/`; "key2" as a string starts with "key". 
+ let key = Path::new("a").join("b").join("key"); + let key2 = Path::new("a").join("b").join("key2"); + + { + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, true).unwrap(); + c.insert_dir(&key, &make_staging_dir(&cache_dir, "a", &[("obj", 10)])) + .unwrap(); + c.insert_dir(&key2, &make_staging_dir(&cache_dir, "b", &[("obj", 10)])) + .unwrap(); + } + + let mut c = LruDiskCache::new_with_dir_entries(&cache_dir, 1000, false).unwrap(); + let strand = key.join("obj"); + let sibling_strand = key2.join("obj"); + assert!(c.contains_key(&strand)); + assert!(c.contains_key(&sibling_strand)); + + let mut tmp = c.prepare_add(&key, 5).unwrap(); + tmp.as_file_mut().write_all(&[7u8; 5]).unwrap(); + c.commit(tmp).unwrap(); + + assert!(cache_dir.join(&key).is_file(), "target replaced by a file"); + assert!(!c.contains_key(&strand), "target strand dropped"); + assert!( + c.contains_key(&sibling_strand), + "sibling strand must survive (component-wise prefix precision)" + ); + assert!( + cache_dir.join(&key2).join("obj").exists(), + "sibling entry's files untouched on disk" + ); + } } diff --git a/src/reflink.rs b/src/reflink.rs new file mode 100644 index 0000000000..1acd7a5c89 --- /dev/null +++ b/src/reflink.rs @@ -0,0 +1,452 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Filesystem reflink (copy-on-write) helpers for the `file_clone` disk cache mode. 
+ +use std::fs; +use std::io; +use std::path::Path; + +#[cfg(target_os = "linux")] +use std::fs::File; + +use tempfile::NamedTempFile; + +#[cfg(target_os = "linux")] +use std::collections::HashSet; +#[cfg(target_os = "linux")] +use std::sync::{LazyLock, Mutex}; + +/// Whether a file was reflinked (shared blocks) or copied. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReflinkOutcome { + Reflinked, + Copied(u64), +} + +impl ReflinkOutcome { + /// `true` when the data was reflinked rather than copied. + pub fn reflinked(self) -> bool { + matches!(self, ReflinkOutcome::Reflinked) + } +} + +/// Probe whether the filesystem backing `dir` supports reflinking. +pub fn is_reflink_supported(dir: &Path) -> bool { + let Ok(temp_dir) = tempfile::tempdir_in(dir) else { + return false; + }; + let src = temp_dir.path().join("reflink_probe_src"); + let dst = temp_dir.path().join("reflink_probe_dst"); + if fs::write(&src, b"sccache reflink probe").is_err() { + return false; + } + reflink_copy::reflink(&src, &dst).is_ok() +} + +/// Reflink (or copy) `src` to a new file `dest` (which must not exist), optionally setting `mode`. +/// Returns whether the data was reflinked or copied. +pub fn reflink_or_copy_new( + src: &Path, + dest: &Path, + mode: Option, +) -> io::Result { + #[cfg(target_os = "linux")] + { + let src_file = File::open(src)?; + let dst_file = fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(dest)?; + let outcome = clone_or_copy_fd(&src_file, &dst_file)?; + apply_fd_mode(&dst_file, dest, mode); + Ok(outcome) + } + #[cfg(not(target_os = "linux"))] + { + // Enforce the "dest must not exist" contract on every platform: where the filesystem + // doesn't support reflinks, the crate would otherwise fall back to a copy that + // overwrites an existing dest (Linux gets this for free via `create_new`). + if dest.try_exists()? 
{ + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + "destination already exists", + )); + } + crate_reflink_or_copy(src, dest, mode) + } +} + +/// Reflink (or copy) `src` to `dest`, atomically replacing any existing file, optionally setting +/// `mode`. Returns whether the data was reflinked or copied. +pub fn reflink_or_copy_atomic( + src: &Path, + dest: &Path, + mode: Option, +) -> io::Result { + let dest_dir = dest.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "destination path has no parent directory", + ) + })?; + + #[cfg(target_os = "linux")] + { + let tmp = NamedTempFile::new_in(dest_dir)?; + let src_file = open_src_nofollow(src)?; + let outcome = clone_or_copy_fd(&src_file, tmp.as_file())?; + apply_fd_mode(tmp.as_file(), dest, mode); + tmp.persist(dest).map_err(|e| e.error)?; + Ok(outcome) + } + #[cfg(not(target_os = "linux"))] + { + let tmp_path = NamedTempFile::new_in(dest_dir)?.into_temp_path(); + fs::remove_file(&tmp_path)?; + let outcome = crate_reflink_or_copy(src, &tmp_path, mode)?; + tmp_path.persist(dest).map_err(|e| e.error)?; + Ok(outcome) + } +} + +/// Reflink (or copy) `src` directly onto `dest` in place (non-atomic fallback for when a temp file +/// cannot be staged in the destination directory). +pub fn reflink_or_copy_direct( + src: &Path, + dest: &Path, + mode: Option, +) -> io::Result { + #[cfg(target_os = "linux")] + { + let src_file = open_src_nofollow(src)?; + let dst_file = fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest)?; + let outcome = clone_or_copy_fd(&src_file, &dst_file)?; + apply_fd_mode(&dst_file, dest, mode); + Ok(outcome) + } + #[cfg(not(target_os = "linux"))] + { + // The destination dir may be non-writable (that's why the atomic temp path failed), so + // overwrite the existing file in place; clonefile can't target an existing file → plain copy. 
+ let mut reader = fs::File::open(src)?; + let mut writer = fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest)?; + let n = io::copy(&mut reader, &mut writer)?; + set_path_mode(dest, mode); + Ok(ReflinkOutcome::Copied(n)) + } +} + +#[cfg(target_os = "linux")] +fn open_src_nofollow(src: &Path) -> io::Result { + use std::os::unix::fs::OpenOptionsExt; + fs::OpenOptions::new() + .read(true) + .custom_flags(libc::O_NOFOLLOW) + .open(src) +} + +#[cfg(target_os = "linux")] +fn apply_fd_mode(file: &File, dest: &Path, mode: Option) { + if let Some(mode) = mode { + if let Err(e) = set_fd_mode(file, mode) { + debug!( + "Failed to set mode {:#o} on {}: {}", + mode, + dest.display(), + e + ); + } + } +} + +#[cfg(all(unix, not(target_os = "linux")))] +fn set_path_mode(path: &Path, mode: Option) { + use std::os::unix::fs::PermissionsExt; + if let Some(mode) = mode { + if let Err(e) = fs::set_permissions(path, std::fs::Permissions::from_mode(mode)) { + debug!( + "Failed to set mode {:#o} on {}: {}", + mode, + path.display(), + e + ); + } + } +} + +#[cfg(not(unix))] +fn set_path_mode(_path: &Path, _mode: Option) {} + +#[cfg(not(target_os = "linux"))] +fn crate_reflink_or_copy(src: &Path, dest: &Path, mode: Option) -> io::Result { + let outcome = match reflink_copy::reflink_or_copy(src, dest) { + Ok(None) => ReflinkOutcome::Reflinked, + Ok(Some(n)) => ReflinkOutcome::Copied(n), + Err(e) => return Err(e), + }; + set_path_mode(dest, mode); + Ok(outcome) +} + +#[cfg(target_os = "linux")] +fn clone_or_copy_fd(src: &File, dst: &File) -> io::Result { + use std::os::unix::fs::MetadataExt; + + let dev = dst.metadata().ok().map(|m| m.dev()); + if dev.map(device_known_unsupported).unwrap_or(false) { + return Ok(ReflinkOutcome::Copied(copy_fd(src, dst)?)); + } + match ficlone(dst, src) { + Ok(()) => Ok(ReflinkOutcome::Reflinked), + Err(e) => { + remember_unsupported(dev, &e); + Ok(ReflinkOutcome::Copied(copy_fd(src, dst)?)) + } + } +} + +#[cfg(target_os = 
"linux")] +fn ficlone(dst: &File, src: &File) -> io::Result<()> { + use std::os::unix::io::AsRawFd; + // SAFETY: both descriptors are valid and owned by `dst`/`src` for the duration of the call. + let ret = unsafe { libc::ioctl(dst.as_raw_fd(), libc::FICLONE, src.as_raw_fd()) }; + if ret == -1 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } +} + +#[cfg(target_os = "linux")] +fn copy_fd(src: &File, dst: &File) -> io::Result { + use std::io::{Seek, SeekFrom}; + dst.set_len(0)?; + let mut reader: &File = src; + let mut writer: &File = dst; + (&mut reader).seek(SeekFrom::Start(0))?; + (&mut writer).seek(SeekFrom::Start(0))?; + io::copy(&mut reader, &mut writer) +} + +#[cfg(target_os = "linux")] +fn set_fd_mode(file: &File, mode: u32) -> io::Result<()> { + use std::os::unix::io::AsRawFd; + // SAFETY: `file` owns a valid descriptor for the duration of the call. + let ret = unsafe { libc::fchmod(file.as_raw_fd(), mode) }; + if ret == -1 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } +} + +#[cfg(target_os = "linux")] +static UNSUPPORTED_DEVICES: LazyLock>> = + LazyLock::new(|| Mutex::new(HashSet::new())); + +#[cfg(target_os = "linux")] +fn device_known_unsupported(dev: u64) -> bool { + UNSUPPORTED_DEVICES.lock().unwrap().contains(&dev) +} + +#[cfg(target_os = "linux")] +fn remember_unsupported(dev: Option, err: &io::Error) { + // EXDEV (cross-filesystem) is intentionally not memoised: it's a property of this src/dst pair, + // not of the device's reflink capability. + let unsupported = matches!(err.kind(), io::ErrorKind::Unsupported) + || matches!( + err.raw_os_error(), + Some(libc::EOPNOTSUPP) | Some(libc::ENOTTY) + ); + if !unsupported { + return; + } + if let Some(dev) = dev { + let newly = UNSUPPORTED_DEVICES.lock().unwrap().insert(dev); + if newly { + warn!( + "file_clone: reflink not supported on destination filesystem (device {dev}): {err}. \ + Falling back to copies; restored files will not share disk blocks with the cache." 
+ ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn test_reflink_or_copy_new_preserves_content() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + let content = b"test data for reflink-or-copy-new"; + fs::write(&src, content).unwrap(); + + let outcome = reflink_or_copy_new(&src, &dst, None).unwrap(); + match outcome { + ReflinkOutcome::Reflinked | ReflinkOutcome::Copied(_) => {} + } + assert!(dst.exists()); + assert_eq!(fs::read(&dst).unwrap(), content); + } + + #[test] + fn test_reflink_or_copy_new_fails_if_dest_exists() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + fs::write(&src, b"a").unwrap(); + fs::write(&dst, b"b").unwrap(); + assert!(reflink_or_copy_new(&src, &dst, None).is_err()); + } + + #[cfg(unix)] + #[test] + fn test_reflink_or_copy_new_sets_mode() { + use std::os::unix::fs::PermissionsExt; + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + fs::write(&src, b"executable-ish").unwrap(); + + reflink_or_copy_new(&src, &dst, Some(0o600)).unwrap(); + let mode = fs::metadata(&dst).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o600); + } + + #[test] + fn test_reflink_or_copy_atomic_overwrites_destination() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + fs::write(&src, b"new content").unwrap(); + fs::write(&dst, b"old content").unwrap(); + + let outcome = reflink_or_copy_atomic(&src, &dst, None).unwrap(); + match outcome { + ReflinkOutcome::Reflinked | ReflinkOutcome::Copied(_) => {} + } + assert_eq!(fs::read(&dst).unwrap(), b"new content"); + } + + #[cfg(unix)] + #[test] + fn test_reflink_or_copy_atomic_sets_mode() { + use std::os::unix::fs::PermissionsExt; + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + 
fs::write(&src, b"data").unwrap(); + + reflink_or_copy_atomic(&src, &dst, Some(0o640)).unwrap(); + let mode = fs::metadata(&dst).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o640); + } + + #[test] + fn test_reflink_or_copy_atomic_empty_file() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + fs::write(&src, b"").unwrap(); + + reflink_or_copy_atomic(&src, &dst, None).unwrap(); + assert!(dst.exists()); + assert_eq!(fs::read(&dst).unwrap(), b""); + } + + #[test] + fn test_reflink_or_copy_direct_overwrites() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src"); + let dst = dir.path().join("dst"); + fs::write(&src, b"fresh").unwrap(); + fs::write(&dst, b"stale-and-longer").unwrap(); + reflink_or_copy_direct(&src, &dst, None).unwrap(); + assert_eq!(fs::read(&dst).unwrap(), b"fresh"); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_extract_refuses_symlinked_source() { + use std::os::unix::fs::symlink; + let dir = tempdir().unwrap(); + let secret = dir.path().join("secret"); + fs::write(&secret, b"top secret outside the cache").unwrap(); + let link = dir.path().join("obj_symlink"); + symlink(&secret, &link).unwrap(); + let dest = dir.path().join("restored"); + + assert!(reflink_or_copy_atomic(&link, &dest, None).is_err()); + assert!(reflink_or_copy_direct(&link, &dest, None).is_err()); + assert!(!dest.exists()); + } + + #[test] + fn test_is_reflink_supported_nonexistent_dir_is_false() { + let dir = tempdir().unwrap(); + assert!(!is_reflink_supported(&dir.path().join("does-not-exist"))); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_remember_unsupported_logic() { + let unsupp = u64::MAX - 101; + remember_unsupported( + Some(unsupp), + &io::Error::from_raw_os_error(libc::EOPNOTSUPP), + ); + assert!(device_known_unsupported(unsupp)); + + let notty = u64::MAX - 102; + remember_unsupported(Some(notty), &io::Error::from_raw_os_error(libc::ENOTTY)); + 
assert!(device_known_unsupported(notty)); + + let xdev = u64::MAX - 103; + remember_unsupported(Some(xdev), &io::Error::from_raw_os_error(libc::EXDEV)); + assert!(!device_known_unsupported(xdev)); + + let transient = u64::MAX - 104; + remember_unsupported(Some(transient), &io::Error::from_raw_os_error(libc::EINTR)); + assert!(!device_known_unsupported(transient)); + } + + #[cfg(target_os = "linux")] + #[test] + fn test_copy_fd_roundtrip() { + let dir = tempdir().unwrap(); + let src_path = dir.path().join("src"); + let data = vec![42u8; 9000]; + fs::write(&src_path, &data).unwrap(); + let src = File::open(&src_path).unwrap(); + let dst_path = dir.path().join("dst"); + fs::write(&dst_path, vec![0u8; 20000]).unwrap(); + let dst = fs::OpenOptions::new().write(true).open(&dst_path).unwrap(); + + let n = copy_fd(&src, &dst).unwrap(); + assert_eq!(n, data.len() as u64); + drop(dst); + assert_eq!(fs::read(&dst_path).unwrap(), data); + } +} diff --git a/src/server.rs b/src/server.rs index 98cedfbbc5..680d871d2d 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1387,11 +1387,13 @@ where stats.cache_errors.increment(&kind, &lang); } - CompileResult::CacheHit(duration) => { + CompileResult::CacheHit(duration, extraction) => { debug!("[{}]: compile result: cache hit", out_pretty); stats.cache_hits.increment(&kind, &lang); stats.cache_read_hit_duration += duration; + stats.objects_reflinked += extraction.objects_reflinked; + stats.objects_copied_fallback += extraction.objects_copied; } CompileResult::CacheMiss(miss_type, dt, duration, future) => { debug!("[{}]: compile result: cache miss", out_pretty); @@ -1610,6 +1612,10 @@ pub struct ServerStats { pub cache_write_errors: u64, /// The number of successful cache writes. pub cache_writes: u64, + /// Cache objects restored by reflink (copy-on-write); only non-zero with `file_clone`. + pub objects_reflinked: u64, + /// Cache objects restored by copying because reflinking was unavailable. 
+ pub objects_copied_fallback: u64, /// The total time spent writing cache entries. pub cache_write_duration: Duration, /// The total time spent reading cache hits. @@ -1670,6 +1676,8 @@ impl Default for ServerStats { forced_recaches: u64::default(), cache_write_errors: u64::default(), cache_writes: u64::default(), + objects_reflinked: u64::default(), + objects_copied_fallback: u64::default(), cache_write_duration: Duration::new(0, 0), cache_read_hit_duration: Duration::new(0, 0), compilations: u64::default(), @@ -1760,6 +1768,16 @@ impl ServerStats { set_stat!(stats_vec, self.cache_timeouts, "Cache timeouts"); set_stat!(stats_vec, self.cache_read_errors, "Cache read errors"); + set_stat!( + stats_vec, + self.objects_reflinked, + "Objects restored by reflink" + ); + set_stat!( + stats_vec, + self.objects_copied_fallback, + "Objects restored by copy" + ); set_stat!(stats_vec, self.forced_recaches, "Forced recaches"); set_stat!(stats_vec, self.cache_write_errors, "Cache write errors"); if advanced { diff --git a/src/test/tests.rs b/src/test/tests.rs index 8d283a1b2b..891da57b4b 100644 --- a/src/test/tests.rs +++ b/src/test/tests.rs @@ -87,6 +87,7 @@ where PreprocessorCacheModeConfig::default(), CacheMode::ReadWrite, vec![], + false, )); let client = Client::new(); diff --git a/tests/integration/Makefile b/tests/integration/Makefile index ab4d13c232..a42437fe9e 100644 --- a/tests/integration/Makefile +++ b/tests/integration/Makefile @@ -31,7 +31,7 @@ endgroup = endif BACKENDS := redis redis-deprecated memcached memcached-deprecated s3 azblob webdav basedirs multilevel multilevel-chain -TOOLS := gcc clang cmake cmake-modules cmake-modules-v4 autotools coverage zstd +TOOLS := gcc clang cmake cmake-modules cmake-modules-v4 autotools coverage zstd file-clone # Map backends to their compose profiles PROFILES_autotools := autotools @@ -42,6 +42,7 @@ PROFILES_cmake := cmake PROFILES_cmake-modules := cmake-modules PROFILES_cmake-modules-v4 := cmake-modules-v4 
PROFILES_coverage := coverage +PROFILES_file-clone := file-clone PROFILES_gcc := gcc PROFILES_memcached := memcached PROFILES_memcached-deprecated := memcached @@ -62,6 +63,7 @@ SERVICES_cmake := SERVICES_cmake-modules := SERVICES_cmake-modules-v4 := SERVICES_coverage := +SERVICES_file-clone := SERVICES_gcc := SERVICES_memcached := memcached SERVICES_memcached-deprecated := memcached @@ -101,6 +103,7 @@ help: @echo "Advanced Tests:" @echo " make test-coverage Run Rust coverage instrumentation test" @echo " make test-zstd Run ZSTD compression levels test" + @echo " make test-file-clone Run file_clone (reflink) disk cache test" @echo " make test-basedirs Run basedirs test across all backends" @echo " make test-multilevel Run multi-level cache test across all backends" @echo " make test-multilevel-chain Run multi-level backfill chain test (4 levels)" diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml index 4e1a657435..957bbc64ec 100644 --- a/tests/integration/docker-compose.yml +++ b/tests/integration/docker-compose.yml @@ -331,6 +331,17 @@ services: - test - zstd + test-file-clone: + <<: *test-runner + image: rust:latest + entrypoint: /sccache/tests/integration/scripts/test-file-clone.sh + environment: + <<: *common-env + CARGO_INCREMENTAL: "0" + profiles: + - test + - file-clone + test-basedirs: <<: *test-runner image: gcc:latest diff --git a/tests/integration/scripts/test-file-clone.sh b/tests/integration/scripts/test-file-clone.sh new file mode 100755 index 0000000000..e0b247a7ac --- /dev/null +++ b/tests/integration/scripts/test-file-clone.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -euo pipefail + +# Integration test for the `file_clone` (uncompressed reflink / copy-on-write) disk cache. +# Mirrors test-zstd.sh: build (miss) -> cargo clean -> rebuild (hit) with SCCACHE_FILE_CLONE=true. +# +# FS-guarded: the test asserts a correct miss -> hit round-trip on ANY filesystem. 
On a CoW +# filesystem (Btrfs/XFS/APFS) restored objects are reflinked; elsewhere they fall back to copies. +# Either way `objects_reflinked + objects_copied_fallback` must be > 0 on the hit (a compressed hit +# would leave both at 0), which proves the uncompressed path actually ran. + +SCCACHE="${SCCACHE_PATH:-/sccache/target/debug/sccache}" + +echo "==========================================" +echo "Testing: file_clone (reflink) disk cache" +echo "==========================================" + +echo "Copying test crate to writable location..." +cp -r /sccache/tests/test-crate /build/ +cd /build/test-crate + +export SCCACHE_DIR=/build/file-clone-cache +export SCCACHE_FILE_CLONE=true +TEST_ENV_VAR="test_value_$(date +%s)" && export TEST_ENV_VAR + +"$SCCACHE" --stop-server >/dev/null 2>&1 || true +"$SCCACHE" --start-server + +echo "Build with file_clone (cache miss)..." +cargo build + +echo "Stats after first build:" +"$SCCACHE" --show-stats + +echo "Build again (cache hit)..." +cargo clean +cargo build + +echo "Stats after second build:" +"$SCCACHE" --show-stats + +STATS_JSON=$("$SCCACHE" --show-stats --stats-format=json) +read_stat() { + echo "$STATS_JSON" | python3 -c "import sys, json; d=json.load(sys.stdin).get('stats', {}); print($1)" +} + +HITS=$(read_stat "d.get('cache_hits', {}).get('counts', {}).get('Rust', 0)") +REFLINKED=$(read_stat "d.get('objects_reflinked', 0)") +COPIED=$(read_stat "d.get('objects_copied_fallback', 0)") + +echo "file_clone cache hits (Rust): $HITS" +echo "objects reflinked: $REFLINKED, objects copied (fallback): $COPIED" + +if [ "$HITS" -eq 0 ]; then + echo "ERROR: No cache hits with file_clone" + exit 1 +fi + +RESTORED=$((REFLINKED + COPIED)) +if [ "$RESTORED" -eq 0 ]; then + echo "ERROR: file_clone hit restored 0 objects via reflink/copy (uncompressed path did not run)" + exit 1 +fi + +if [ "$REFLINKED" -gt 0 ]; then + echo "Restored via reflink (copy-on-write filesystem detected)." 
+else + echo "Restored via copy fallback (filesystem does not support reflinks)." +fi + +"$SCCACHE" --stop-server >/dev/null 2>&1 || true + +echo "==========================================" +echo "PASS: file_clone disk cache test" +echo "Cache hits: $HITS, reflinked: $REFLINKED, copied: $COPIED" +echo "==========================================" diff --git a/tests/system.rs b/tests/system.rs index fbbaeca96b..2a0ea0185d 100644 --- a/tests/system.rs +++ b/tests/system.rs @@ -1985,6 +1985,55 @@ fn test_sccache_command(preprocessor_cache_mode: bool) { } } +#[test] +#[serial] +#[cfg(any(unix, target_env = "msvc"))] +fn test_sccache_command_file_clone() { + let _ = env_logger::try_init(); + let tempdir = tempfile::Builder::new() + .prefix("sccache_system_test_file_clone") + .tempdir() + .unwrap(); + let compilers = find_compilers(); + if compilers.is_empty() { + warn!("No compilers found, skipping test"); + } else { + stop_local_daemon(); + let mut sccache_cfg = sccache_client_cfg(tempdir.path(), false); + sccache_cfg + .cache + .disk + .as_mut() + .expect("disk cache config") + .file_clone = true; + write_json_cfg(tempdir.path(), "sccache-cfg.json", &sccache_cfg); + let sccache_cached_cfg_path = tempdir.path().join("sccache-cached-cfg"); + trace!("start server"); + start_local_daemon( + &tempdir.path().join("sccache-cfg.json"), + &sccache_cached_cfg_path, + ); + for compiler in compilers { + // test_basic_compile hard-codes C-compiler stats assertions, so skip the C++ driver. + if compiler.name != "clang++" { + test_basic_compile(compiler, tempdir.path()); + // A compressed hit would leave both counters at 0, so requiring sum >= 1 + // proves the uncompressed file_clone restore path ran (reflink or copy). 
+ get_stats(|info| { + assert!( + info.stats.objects_reflinked + info.stats.objects_copied_fallback >= 1, + "file_clone hit must reflink or copy >= 1 object (reflinked={}, copied={})", + info.stats.objects_reflinked, + info.stats.objects_copied_fallback + ); + }); + zero_stats(); + } + } + stop_local_daemon(); + } +} + #[test] #[serial] fn test_stats_no_server() {