From 6b1b76684fe3c5bcd925d092425d8d60ef484de7 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Sun, 14 Jun 2026 20:48:32 +0200 Subject: [PATCH 01/20] test(219): add seam-SROA grounding fixture (pack/unpack round-trip) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Faithful local repro of gale #219's C<->Rust decide seam: `decide` returns {action, new_count} packed into an i64; the shim unpacks them. After `--passes inline` the `run` body contains the pack (extend_i32_u/shl/or) → i64 carrier local → unpack (and/shr_u/wrap_i64) round-trip on one value — exactly the residue to dissolve. Verified: optimizes + validates today; inline leaves the round-trip intact (confirmed via `loom optimize --passes inline` + wasm-tools print). The seam-SROA pass (next) must forward the scalar components and drop the carrier, yielding `(action & 0xff) + new_count` — each rewrite Z3-proven. Refs #219 --- loom-core/tests/fixtures/seam_sroa_decide.wat | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 loom-core/tests/fixtures/seam_sroa_decide.wat diff --git a/loom-core/tests/fixtures/seam_sroa_decide.wat b/loom-core/tests/fixtures/seam_sroa_decide.wat new file mode 100644 index 0000000..da582f5 --- /dev/null +++ b/loom-core/tests/fixtures/seam_sroa_decide.wat @@ -0,0 +1,51 @@ +;; #219 — seam SROA / scalar-forwarding fixture. +;; +;; Models the gale C<->Rust "decide" seam: `decide` returns two i32 scalars +;; {action, new_count} PACKED into a single i64 (the u64-return ABI), and the +;; shim immediately UNPACKS them back to scalars. After `--passes inline`, +;; `decide` is inlined into `shim`, so on one value we get: +;; +;; pack: i64.extend_i32_u ; i64.shl ; i64.or (build the u64) +;; carry: local.set $t / local.get $t (the dead i64 carrier) +;; unpack: i64.and ; i64.shr_u ; i32.wrap_i64 (tear it back apart) +;; +;; The u64 is constructed from two i32s and immediately decomposed — textbook +;; SROA. The kill-criterion (gale #219): after the seam-SROA pass the dissolved +;; body has NO i64 pack/unpack and the i64 carrier local is gone; the result is +;; equivalent to operating on `action`/`new_count` directly. +;; +;; Layout: action in bits [0..8), new_count in bits [8..40). +(module + ;; decide: pack {action, new_count} -> u64 + ;; t = (extend_u(new_count) << 8) | (extend_u(action) & 0xff) + (func $decide (param $action i32) (param $new_count i32) (result i64) + local.get $new_count + i64.extend_i32_u + i64.const 8 + i64.shl + local.get $action + i64.extend_i32_u + i64.const 0xff + i64.and + i64.or) + + ;; shim: call decide, then unpack both scalars back and combine. + ;; Post-inline this is the pure pack/unpack round-trip on $t. + ;; Semantically equivalent to: (action & 0xff) + new_count + (func (export "run") (param $action i32) (param $new_count i32) (result i32) + (local $t i64) + local.get $action + local.get $new_count + call $decide + local.set $t + ;; unpack action = (t & 0xff) + local.get $t + i64.const 0xff + i64.and + i32.wrap_i64 + ;; unpack new_count = (t >> 8) + local.get $t + i64.const 8 + i64.shr_u + i32.wrap_i64 + i32.add)) From 5cea5bf4cb7c413d670a0c83b0c30b27e22c77f9 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 15 Jun 2026 03:11:27 +0200 Subject: [PATCH 02/20] test(219): add gale's verified real repro (sem.loom.wasm) + notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gale attached the authentic post-loom-inline dissolved z_impl_k_sem_give on the issue. Decoded + verified: sha256 f81da42d… (matches), 5254 B, valid wasm. It contains the u64 pack/unpack round-trip to dissolve — pack (extend_i32_u/shl/or) at body lines 59-63, unpack (and/shr_u/wrap_i64) at 90/104, with a dead i64 carrier. Grounds the seam-SROA pass against the real body, not just the synthetic fixture. Design confirmed on #219 (proof-carrying: Z3 obligation per ISLE rewrite; ISLE-term route preferred). Implementation (mem2reg + pack/unpack forwarding + DCE) follows. Refs #219 --- loom-core/tests/fixtures/repro-219/README.md | 30 ++++++++++++++++++ .../tests/fixtures/repro-219/sem.loom.wasm | Bin 0 -> 5254 bytes 2 files changed, 30 insertions(+) create mode 100644 loom-core/tests/fixtures/repro-219/README.md create mode 100644 loom-core/tests/fixtures/repro-219/sem.loom.wasm diff --git a/loom-core/tests/fixtures/repro-219/README.md b/loom-core/tests/fixtures/repro-219/README.md new file mode 100644 index 0000000..7e090fd --- /dev/null +++ b/loom-core/tests/fixtures/repro-219/README.md @@ -0,0 +1,30 @@ +# #219 seam-SROA repro + +`sem.loom.wasm` — gale's authentic post-loom-inline dissolved `z_impl_k_sem_give` +(the C↔Rust k_sem_give decide seam). Provided on issue #219; **sha256 +`f81da42d…`, 5254 B** (verified on decode). From +`gale-smart-data/benches/engine_control/silicon/wasm-testbed/repro-loom-seam-sroa/`. + +It still contains the u64 pack/unpack round-trip the seam-SROA pass must dissolve: + +- **pack** (decide builds the u64): `i64.extend_i32_u; i64.shl; i64.or` +- **unpack** (shim tears it back to scalars): `i64.and`, `i64.shr_u`, `i32.wrap_i64` +- a dead `i64` carrier local in between + +## Kill-criterion (gale, on-silicon G474RE) + +- structural: dissolved body has **no i64 pack/unpack**, carrier local gone, + ARM body 83 → ~55–60 insns. +- silicon: sem 860 → toward 471 (LLVM-LTO); mutex 472. gale re-flashes. + +## Plan (design confirmed on the issue — proof-carrying) + +1. wasm-local mem2reg: promote the single-assignment non-escaping i64 carrier so + the pack expression reaches the unpack sites. +2. SROA / pack-unpack algebraic forwarding via ISLE rewrites (preferred: add + `i64.extend_i32_u` / `i32.wrap_i64` ISLE terms), each with a **Z3 proof** + discharged before the rule ships: `(extend_u(a) | (extend_u(b)<>k → b`, `wrap_i64(extend_i32_u(x)) → x`. Then DCE drops the carrier+pack. +3. const dedup/hoist. + +See also the synthetic `../seam_sroa_decide.wat`. diff --git a/loom-core/tests/fixtures/repro-219/sem.loom.wasm b/loom-core/tests/fixtures/repro-219/sem.loom.wasm new file mode 100644 index 0000000000000000000000000000000000000000..04328bac179c4f2f58e3e3f6d34d9be60ef477c8 GIT binary patch literal 5254 zcmd5=U5wmT6+YJ<&rD`^XD3;u`4?W;(nPEA`fvWWp~)_^NgxFYk`fBS@z~e9CNrMF z<4MS-%|b|0 zULMK`1WpmychhLPlae#+W2B5VCo+cKO=1Ofgp%vmM^)hL;6+WYunS z3r*x5Yt_HVEtfmK%OSgS-SRuV_QVk63qN5Q3CcMro?`6k3Y!y!aIk70i8!PgQvUt6i1M&D26pLpC0b@q? zm0^y_P-b%i>6YQJkf&x!7M9^eIs0L8;hHe5xV$x3wd__aeE2$=Lx43IwHix}uEA ziX#$dCtzv}5D8n1Kf7nbcu0&N>Z=ux?R$x#4*);Q71I#-f!Dps&@X}^!*1jN4#MB0 zn1sDZ-NGwCQur=4J!;;f<_gA191iCGDIx|xAdvluWtN3R@H~R-9d<-ACjhBalnN=m z8yTeXT=u_c11Te)gwu@?jWE*tv)@nocf|B#kUf|J#31dgj}%VF^xQ~*XyNo$fn5hj z0%W+fYb-!C3FLg7fi7uJDgW!VpM7;2a`VWtmj-sv=5OHu^@Z6TiG)f6`x6Q3lZ137 zHI~9E?VHQ~XoAkZwv}Q3qqj2* zw-|A~iwwd$)T~kS5;c^#!Z&dAU)YgeLG?{S$~fWg3HcTd(Ivc3%`d6>C^l2?r3{(p z@@x|U!XX;}7}`N9GiE;4;8GuNY|{W-$C?C683iEL{%r(=7pPgpX6hIY{t!ECKUt6H zxYFY&MTix?iA*cx@2OzpBTW;+@R*ckGGJVuKtod{oY$p36wH zqoVL7VxV|al!gphWcWBSyn#yb3Wl$(&<<94-$ZB$?-O`GgZJ~Xva`|=P>vE%ql zDIw#B7ujXwP1DoUH@e=NDWorneiUAzH~&X!&qLDD&0$X)^+p|IIM&pLJzaesdUoJD zmeURJ`)0eY)xF3=uid>gIRO1Zuh$JD@Pi0GaSEG6%IieX@?+SFLfen}(CziX>2`X4 zn}=3w;I|_`=o3>ngmoTvtxNdP3LqGCCP=F@Xv3w@?&0TmH71@$V9kfN-|AQ$J6cO7 zynw@~J>+9%G#RXCKW!4)jIAOjO`C8_Hz$ZqK5F$f+u=^LR&zbwQ0;oddvY6XiCzo6 zMS)tasu`N$7+iO1j-CiKs@DAJGxNTtH|7VrI&b&mpW6bVDCCTyjQmoRG_;C>u}y=h zC>Y}T6KCc%eH0Y*lP2djZ+M>5uywmxPbb&vx6NaqG6REZU>xZeMO(UAZ#A2G&1q;2 z*WhDE4NQivCk|ja#g@mOY-fSdQ%F;y5{B?YUYE?K&47?)Yp7ej1e?+)vOt&=Xq+?Ng`0YL*_!RC|Jyj zg0-tCD}MBukW^LUQYJHvrplW&({;J2nRX-Ac865d5|>S~M^nA9+rdSq6&uBkx}#!J zwW{8545xlKHI&JQMbAg%8)!mZ^*qc)T{o*uui2`_vJEqF=$?`Ke=9L5C+^q=H$1QE zU}4#w5zqaQbzqL`fSg1RwT4=+YAsLIje5&9l3C(sJuX^pOVOgwn`<`FqSrOes5ffd zsa5T2O@9&)KT0M31dNlB0*m3GA32NMR2xlvVmwz@RogRlw`J(MYkR6^yB@D|z2+J% zS6jqKIkdyei^FHIGJNT6gY-j(_?57~cziTGdP^!!z3JA}7PlMdYol4MII9Ci)tc}@ zvq`JhU|5GAIOEj1@t`(6DE~y#L%z5rtN7xm)tr`TR!v=P;H#`!PZ9}E8C?|V zEgz1nOvZ0*-|q&prZi1OlMj!oPS#buR#A1#e#l!CMp7=y`*;9d_0y^|IdCH+U~P8{7B9@BDj}H+!oPJD@z?BjjhpUC*D6Y@w-Hk)I^28^edMVRb3fGRf%oS*pBfc%Nj3DhU zi2r!*X7=@rC{1U@pOpi#OpM;!%qK zCC2|;`g-9&x@djf;9wGR#kO}M#hg`O=T%yyg7jeTjc}&H& zKcN!tdN?I+IhV?8I+Bu3(NcP8X|$G3yU7+|6R~M6y|i@OS~`1&Gpk(X4pZO?8`W%G zVdrl@;mYB5Z^p9vk?mZtdR@#H58*p(CbGj-9$6l@@jQQl4gdfE literal 0 HcmV?d00001 From 701d2a4f51b74f6dcc227c7ce5f682b049241b41 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 15 Jun 2026 05:34:29 +0200 Subject: [PATCH 03/20] feat(219): ISLE term scaffolding for seam-SROA (i32.wrap_i64 / i64.extend_i32_u) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lays the ISLE-layer foundation for the #219 pack/unpack forwarding rules: - types.isle: I32WrapI64 / I64ExtendI32U ValueData variants. - constructors.isle: decl + extern constructor + extern extractor for each. - loom-shared/src/lib.rs: i32_wrap_i64_extract / i64_extend_i32_u_extract (mirror iadd32_extract, single-field). - build.rs: register seam_sroa.isle AND add the two extractors to the fix_extractor_ownership list (cranelift-isle 0.132.1 emits extractor args as owned `Value`; the post-processor rewrites them to `&Value` — without this the generated trie move-then-borrows arg0 and loom-shared won't compile). - seam_sroa.isle: the unconditional, Z3-trivial rule `wrap_i64(extend_i32_u(x)) → x`. loom-shared builds clean. IMPORTANT (next step): the generated ISLE `simplify` is NOT in loom's live rewrite path — build.rs documents this, and the active rewriters are the Rust `rewrite_pure_impl` / `rewrite_with_dataflow` (loom-shared/src/lib.rs), validated by the Z3 translation validator per pass. So this rule does not fire yet; the LIVE seam-SROA rewrite must be added as a `rewrite_pure_impl` match arm (ValueData::I32WrapI64{val} where val is I64ExtendI32U{inner} → inner), with the Z3 verifier discharging the obligation. ISLE terms retained as proof-layer documentation of the identity. Refs #219 --- loom-shared/build.rs | 5 +++++ loom-shared/isle/constructors.isle | 16 ++++++++++++++++ loom-shared/isle/rules/seam_sroa.isle | 18 ++++++++++++++++++ loom-shared/isle/types.isle | 6 ++++++ loom-shared/src/lib.rs | 17 +++++++++++++++++ 5 files changed, 62 insertions(+) create mode 100644 loom-shared/isle/rules/seam_sroa.isle diff --git a/loom-shared/build.rs b/loom-shared/build.rs index 8380229..adfd7e7 100644 --- a/loom-shared/build.rs +++ b/loom-shared/build.rs @@ -29,6 +29,7 @@ fn main() { PathBuf::from("isle/types.isle"), PathBuf::from("isle/constructors.isle"), PathBuf::from("isle/rules/constant_folding.isle"), + PathBuf::from("isle/rules/seam_sroa.isle"), PathBuf::from("isle/rules/default.isle"), ]; @@ -100,6 +101,10 @@ fn fix_extractor_ownership(code: &str) -> String { "iadd64_extract", "isub64_extract", "imul64_extract", + // #219 seam-SROA — unary conversion extractors (cranelift-isle emits + // these with owned `Value`; rewrite to `&Value` like the rest). + "i32_wrap_i64_extract", + "i64_extend_i32_u_extract", ]; for extractor in &extractors { diff --git a/loom-shared/isle/constructors.isle b/loom-shared/isle/constructors.isle index 3c82f6a..fb14ca6 100644 --- a/loom-shared/isle/constructors.isle +++ b/loom-shared/isle/constructors.isle @@ -137,6 +137,22 @@ (decl ipopcnt64 (Value) Value) (extern constructor ipopcnt64 ipopcnt64) +;; ============================================================================ +;; Integer width conversions (#219 seam-SROA) +;; ============================================================================ +;; Both constructor and extractor are declared so the pack/unpack forwarding +;; rules can match these terms on a rule LHS and rebuild them on the RHS. +;; NOTE: the *_extract fns must also be listed in build.rs `fix_extractor_ +;; ownership` so the generated trait takes &Value (cranelift-isle emits owned). + +(decl i32_wrap_i64 (Value) Value) +(extern constructor i32_wrap_i64 i32_wrap_i64) +(extern extractor i32_wrap_i64 i32_wrap_i64_extract) + +(decl i64_extend_i32_u (Value) Value) +(extern constructor i64_extend_i32_u i64_extend_i32_u) +(extern extractor i64_extend_i32_u i64_extend_i32_u_extract) + ;; ============================================================================ ;; Other Constructors ;; ============================================================================ diff --git a/loom-shared/isle/rules/seam_sroa.isle b/loom-shared/isle/rules/seam_sroa.isle new file mode 100644 index 0000000..cd827bc --- /dev/null +++ b/loom-shared/isle/rules/seam_sroa.isle @@ -0,0 +1,18 @@ +;; LOOM - #219 seam-SROA rewrite rules (proof-carrying) +;; +;; Dissolves the C<->Rust decide seam's u64 pack/unpack residue. Each rule +;; carries a Z3 soundness obligation discharged before it ships (the pass +;; output is also translation-validated). The bit-precise masked-forward rules +;; (`(extend_u(a) | (extend_u(b)< a`, `>>k -> b`) land in a later +;; step once the carrier mem2reg brings the pack expression to the unpack site; +;; this file starts with the unconditional identity that needs no side +;; conditions. + +;; wrap_i64(extend_i32_u(x)) = x +;; +;; Soundness: extend_i32_u zero-extends an i32 to i64 (Z3: zero_ext(32, x)); +;; wrap_i64 takes the low 32 bits (Z3: extract(31, 0, .)). For all i32 x, +;; extract(31,0, zero_ext(32, x)) = x. Unconditionally sound — no escape or +;; mask side conditions. Removes the round-trip a packed scalar pays when its +;; carrier is widened to the u64 ABI and immediately narrowed back. +(rule 100 (simplify (i32_wrap_i64 (i64_extend_i32_u x))) x) diff --git a/loom-shared/isle/types.isle b/loom-shared/isle/types.isle index f5cf1c3..35f3c23 100644 --- a/loom-shared/isle/types.isle +++ b/loom-shared/isle/types.isle @@ -115,6 +115,12 @@ (I64Ctz (val Value)) (I64Popcnt (val Value)) + ;; Integer width conversions (#219 seam-SROA — needed so the pack/unpack + ;; forwarding rules can match on these terms). Field name `val` mirrors the + ;; Rust ValueData variants in loom-shared/src/lib.rs. + (I32WrapI64 (val Value)) + (I64ExtendI32U (val Value)) + ;; Select instruction (Select (cond Value) (true_val Value) (false_val Value)) diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index b1d70bc..a182794 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -2132,6 +2132,23 @@ pub fn i64_extend_i32_u(val: Value) -> Value { Value(Box::new(ValueData::I64ExtendI32U { val })) } +/// Extractor for i32.wrap_i64 — used by ISLE rules to pattern-match the term +/// on a rule LHS (#219 seam-SROA). Returns the inner operand. +pub fn i32_wrap_i64_extract(val: &Value) -> Option { + match val.0.as_ref() { + ValueData::I32WrapI64 { val } => Some(val.clone()), + _ => None, + } +} + +/// Extractor for i64.extend_i32_u — ISLE rule LHS matching (#219 seam-SROA). +pub fn i64_extend_i32_u_extract(val: &Value) -> Option { + match val.0.as_ref() { + ValueData::I64ExtendI32U { val } => Some(val.clone()), + _ => None, + } +} + // ============================================================================ // Float-to-Integer Truncation Constructors (trapping) // ============================================================================ From ae8f08e048730ee66f2923fb14f5a6fd064b6617 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 15 Jun 2026 06:47:00 +0200 Subject: [PATCH 04/20] =?UTF-8?q?feat(219):=20live=20seam-SROA=20rewrite?= =?UTF-8?q?=20=E2=80=94=20wrap=5Fi64(extend=5Fi32=5Fu(x))=20=E2=86=92=20x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual rewrite (the ISLE rule committed earlier is in loom's dead simplify path; the live rewriters are the Rust rewrite_pure_impl/rewrite_with_dataflow, validated by the Z3 translation validator per pass). Adds rewrite_pure_impl match arms for the two #219 conversion terms: - I32WrapI64: recurse operand; if it is I64ExtendI32U(inner) → inner (unconditionally sound: extract(31,0, zero_ext(32,x)) = x — the u64-ABI round-trip a packed scalar pays at the dissolved decide seam); also folds wrap_i64(i64.const N) → i32.const (low 32). - I64ExtendI32U: recurse operand; folds extend_i32_u(i32.const N) → i64.const (zero-extended). Test: test_seam_sroa_wrap_extend_identity — wrap_i64(extend_i32_u(local.get 0)) dissolves to local.get 0. Full `cargo test --release --lib --test-threads=4` green, 0 failures. This is the foundation case. The sem repro's full dissolution additionally needs the carrier mem2reg + the masked and/shr_u forwarding rules (steps 2-3), since its unpack uses and/shr_u (not wrap) on the packed u64. Refs #219 --- loom-core/src/lib.rs | 19 +++++++++++++++++++ loom-shared/src/lib.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 68dddbf..cfcfea7 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -17746,6 +17746,25 @@ mod tests { assert_eq!(instrs[1], Instruction::I32TruncF32S); } + #[test] + fn test_seam_sroa_wrap_extend_identity() { + // #219 seam-SROA: wrap_i64(extend_i32_u(x)) → x. Zero-extending an i32 to + // i64 then wrapping back to i32 is the identity — the u64-ABI round-trip + // a packed scalar pays at the dissolved decide seam. Must dissolve to the + // bare operand (no wrap/extend left). + use loom_isle::{i32_wrap_i64, i64_extend_i32_u, local_get, rewrite_pure}; + let x = local_get(0); // an i32 value + let round_trip = i32_wrap_i64(i64_extend_i32_u(x)); + let simplified = rewrite_pure(round_trip); + + let instrs = terms::terms_to_instructions(&[simplified]).unwrap(); + assert_eq!( + instrs, + vec![Instruction::LocalGet(0)], + "#219: wrap_i64(extend_i32_u(x)) must dissolve to x" + ); + } + #[test] fn test_conversion_trunc_sat_folds_nan_to_zero() { // i32.trunc_sat_f32_s of NaN → i32.const 0 (saturating: NaN→0) diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index a182794..fd089af 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -4362,6 +4362,32 @@ fn rewrite_pure_impl(val: Value) -> Value { } } + // Integer width conversions (#219 seam-SROA). The live rewrite for the + // decide seam's u64 pack/unpack round-trip. Z3-validated per pass. + ValueData::I32WrapI64 { val } => { + let val_simplified = rewrite_pure(val.clone()); + match val_simplified.data() { + // #219 seam-SROA: wrap_i64(extend_i32_u(x)) = x. zero-extending an + // i32 to i64 then taking the low 32 bits is the identity on i32 — + // unconditionally sound (Z3: extract(31,0, zero_ext(32,x)) = x). + ValueData::I64ExtendI32U { val: inner } => inner.clone(), + // Constant folding: wrap_i64(i64.const N) → i32.const (low 32 bits). + ValueData::I64Const { val: v } => iconst32(Imm32(v.value() as i32)), + _ => i32_wrap_i64(val_simplified), + } + } + + ValueData::I64ExtendI32U { val } => { + let val_simplified = rewrite_pure(val.clone()); + match val_simplified.data() { + // Constant folding: extend_i32_u(i32.const N) → i64.const (zero-extended). + ValueData::I32Const { val: v } => { + iconst64(Imm64((v.value() as u32 as u64) as i64)) + } + _ => i64_extend_i32_u(val_simplified), + } + } + // Select instruction optimization ValueData::Select { cond, From 43564411a40bcd3eb2d20d70d2cf305340e7c5ea Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 16 Jun 2026 21:33:43 +0200 Subject: [PATCH 05/20] feat(219): step-3 masked pack/unpack SROA rules (the &mask and >>k halves) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dissolves the u64-pack unpack arithmetic, in rewrite_pure_impl (Z3-validated per pass; pure-integer so the verifier actually verifies — no skip). Each rule is unconditionally sound with only constant side-conditions (no range analysis): I64And (the `& mask` unpack): - (shl Z k) & M → 0 when M's set bits are all below k (M>>k==0, 0> k` unpack): - (shr_u (shl Z k) k) → Z & (u64::MAX>>k) (shl-then-shr-same masks low 64-k) - (shr_u (or P Q) k) → (P>>k)|(Q>>k) when an operand is (shl _ k) (logical shift distributes over OR; targeted to the pack shape — no general bloat; recursion collapses the shl side via the rule above) Together these reduce a forwarded pack `(or (shl (extend_u hi) 32) lo)`: & 0xff → lo & 0xff (low field) >> 32 → (extend_u hi) & 0xffffffff (high field; lo const shifts out) Tests: test_seam_sroa_mask_clears_shifted_pack, test_seam_sroa_shr_extracts_ high_field (+ the step-1 wrap/extend test). Full `cargo test --release --lib --test-threads=4` green (399/28/17, 0 failures). These fire once inlining delivers pack/unpack adjacency (the inline-modelability extension for control-flow + trap-call decides is the remaining prerequisite, tracked separately — higher blast radius, needs gale silicon validation). Refs #219 --- loom-core/src/lib.rs | 56 +++++++++++++++++++++++ loom-shared/src/lib.rs | 101 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index cfcfea7..d281781 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -17765,6 +17765,62 @@ mod tests { ); } + #[test] + fn test_seam_sroa_mask_clears_shifted_pack() { + // #219 seam-SROA: (or (shl (extend_u x) 32) Y) & 0xff → Y & 0xff. + // The high-shifted field of a u64 pack cannot survive a low-byte unpack + // mask, so the whole high half drops out. This is the `& mask` half of + // dissolving the sem decide's pack/unpack round-trip. + use loom_isle::{ + Imm64, i64_extend_i32_u, iand64, iconst64, ior64, ishl64, local_get, rewrite_pure, + }; + let high = ishl64(i64_extend_i32_u(local_get(0)), iconst64(Imm64(32))); + let low = i64_extend_i32_u(local_get(1)); // the surviving low field + let pack = ior64(high, low); + let masked = iand64(pack, iconst64(Imm64(0xff))); + let simplified = rewrite_pure(masked); + + // Expect (extend_u(local.get 1)) & 0xff — the shifted high half is gone. + let want = terms::terms_to_instructions(&[iand64( + i64_extend_i32_u(local_get(1)), + iconst64(Imm64(0xff)), + )]) + .unwrap(); + let got = terms::terms_to_instructions(&[simplified]).unwrap(); + assert_eq!( + got, want, + "#219: (or (shl _ 32) Y) & 0xff must dissolve to Y & 0xff" + ); + } + + #[test] + fn test_seam_sroa_shr_extracts_high_field() { + // #219 seam-SROA: (shr_u (or (shl (extend_u x) 32) const) 32) extracts the + // HIGH field of a u64 pack → (extend_u x) & 0xffffffff. The low (const) + // field shifts out (logical shift distributes over OR; shl-then-shr-same + // masks to the low 64-k bits). Mirrors the sem decide whose low field is + // a constant 0/1. + use loom_isle::{ + Imm64, i64_extend_i32_u, iand64, iconst64, ior64, ishl64, ishru64, local_get, + rewrite_pure, + }; + let high = ishl64(i64_extend_i32_u(local_get(0)), iconst64(Imm64(32))); + let pack = ior64(high, iconst64(Imm64(1))); // low field = const (like local 3 = 0/1) + let unpacked = ishru64(pack, iconst64(Imm64(32))); + let simplified = rewrite_pure(unpacked); + + let want = terms::terms_to_instructions(&[iand64( + i64_extend_i32_u(local_get(0)), + iconst64(Imm64(0xffff_ffff)), + )]) + .unwrap(); + let got = terms::terms_to_instructions(&[simplified]).unwrap(); + assert_eq!( + got, want, + "#219: (shr_u (or (shl (extend_u x) 32) const) 32) must extract (extend_u x) & 0xffffffff" + ); + } + #[test] fn test_conversion_trunc_sat_folds_nan_to_zero() { // i32.trunc_sat_f32_s of NaN → i32.const 0 (saturating: NaN→0) diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index fd089af..2b9dc3b 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -3361,6 +3361,51 @@ fn are_values_equal(lhs: &Value, rhs: &Value) -> bool { } } +/// #219 seam-SROA helper: true if `op` is `(i64.shl _ (i64.const k))` with +/// `0 < k < 64` and every set bit of `mask` is below k (`(mask as u64) >> k == +/// 0`). In that case `(op & mask) == 0` for ALL shift inputs — the shifted +/// value only occupies bits [k,64), which the mask zeroes. k is restricted to +/// (0,64) so the wasm shift-amount-mod-64 wrap can't change the effective +/// shift; k>=64 is left to the verifier rather than reasoned about here. +fn i64_shl_cleared_by_mask(op: &Value, mask: &Imm64) -> bool { + if let ValueData::I64Shl { rhs, .. } = op.data() { + if let ValueData::I64Const { val: k } = rhs.data() { + let k = k.value(); + if k > 0 && k < 64 { + return (mask.value() as u64) >> (k as u64) == 0; + } + } + } + false +} + +/// #219 seam-SROA helper: true if `shl_amt` is the constant `k` and `0 < k < +/// 64` — i.e. an `(i64.shl _ k)` whose amount equals the enclosing `shr_u`'s. +/// `(Z << k) >> k == Z & (u64::MAX >> k)` exactly when k is in (0,64) (no wasm +/// shift-mod-64 wrap). +fn i64_shr_undoes_shl(shl_amt: &Value, shr_amt: &Imm64) -> bool { + if let ValueData::I64Const { val: k2 } = shl_amt.data() { + let k = shr_amt.value(); + return k == k2.value() && k > 0 && k < 64; + } + false +} + +/// #219 seam-SROA helper: true if `op` is `(i64.shl _ (i64.const amt))` with +/// `0 < amt < 64`. Used to target the shr_u-over-or distribution at the pack +/// shape (`(or (shl Z k) B) >> k`) so it only fires where it dissolves. +fn i64_is_shl_by(op: &Value, amt: i64) -> bool { + if amt <= 0 || amt >= 64 { + return false; + } + if let ValueData::I64Shl { rhs, .. } = op.data() { + if let ValueData::I64Const { val: k } = rhs.data() { + return k.value() == amt; + } + } + false +} + /// Stateless simplification (expression-level only) fn rewrite_pure_impl(val: Value) -> Value { match val.data() { @@ -3730,6 +3775,37 @@ fn rewrite_pure_impl(val: Value) -> Value { // Algebraic: x & -1 = x (all bits set) (_, ValueData::I64Const { val }) if val.value() == -1 => lhs_simplified, (ValueData::I64Const { val }, _) if val.value() == -1 => rhs_simplified, + + // #219 seam-SROA: (shl Z k) & M → 0 when M's set bits are all + // below k — the left shift zeroes bits [0,k), the mask zeroes + // bits [k,64), so nothing survives. Unconditionally sound for + // any Z (Z3: (Z<> k == 0). Dissolves the + // high half of a u64 pack under a low-byte unpack mask. + (ValueData::I64Shl { .. }, ValueData::I64Const { val: m }) + if i64_shl_cleared_by_mask(&lhs_simplified, m) => + { + iconst64(Imm64(0)) + } + (ValueData::I64Const { val: m }, ValueData::I64Shl { .. }) + if i64_shl_cleared_by_mask(&rhs_simplified, m) => + { + iconst64(Imm64(0)) + } + // #219 seam-SROA: (or A B) & M → (survivor & M) when one OR + // operand is a left shift the mask clears. Recurse so the + // survivor (and a both-shifted case) simplifies further. + (ValueData::I64Or { lhs: a, rhs: b }, ValueData::I64Const { val: m }) + if i64_shl_cleared_by_mask(a, m) || i64_shl_cleared_by_mask(b, m) => + { + let survivor = if i64_shl_cleared_by_mask(a, m) { b } else { a }; + rewrite_pure(iand64(survivor.clone(), iconst64(*m))) + } + (ValueData::I64Const { val: m }, ValueData::I64Or { lhs: a, rhs: b }) + if i64_shl_cleared_by_mask(a, m) || i64_shl_cleared_by_mask(b, m) => + { + let survivor = if i64_shl_cleared_by_mask(a, m) { b } else { a }; + rewrite_pure(iand64(survivor.clone(), iconst64(*m))) + } _ => iand64(lhs_simplified, rhs_simplified), } } @@ -3825,6 +3901,31 @@ fn rewrite_pure_impl(val: Value) -> Value { } // Algebraic: x >> 0 = x (_, ValueData::I64Const { val }) if (val.value() & 0x3F) == 0 => lhs_simplified, + + // #219 seam-SROA: (shr_u (shl Z k) k) → Z & (low 64-k bits). + // Shifting left by k then logically right by the same k clears + // the high k bits and keeps the low 64-k. Unconditionally sound + // for 0>k == Z & (2^(64-k)-1)). Dissolves the + // high half of a u64 pack under a >>k unpack. + (ValueData::I64Shl { lhs: z, rhs: shamt }, ValueData::I64Const { val: k }) + if i64_shr_undoes_shl(shamt, k) => + { + let mask = (u64::MAX >> (k.value() as u64)) as i64; + rewrite_pure(iand64(z.clone(), iconst64(Imm64(mask)))) + } + // #219 seam-SROA: (shr_u (or P Q) k) → (or (P>>k) (Q>>k)) when an + // OR operand is (shl _ k) — logical right shift distributes over + // bitwise OR (sound), and recursing lets the shl side collapse + // via the rule above. Targeted to the pack shape (matching shl + // present) so it never bloats unrelated `(or _ _) >> k`. + (ValueData::I64Or { lhs: p, rhs: q }, ValueData::I64Const { val: k }) + if i64_is_shl_by(p, k.value()) || i64_is_shl_by(q, k.value()) => + { + rewrite_pure(ior64( + ishru64(p.clone(), iconst64(*k)), + ishru64(q.clone(), iconst64(*k)), + )) + } _ => ishru64(lhs_simplified, rhs_simplified), } } From 2ab674c72d2bd8c46d2eb0903a2c3ba2cf695ebb Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 06:50:23 +0200 Subject: [PATCH 06/20] =?UTF-8?q?docs(219):=20PR-C=20design=20=E2=80=94=20?= =?UTF-8?q?precise=20acyclic=20CF=20symbolic=20execution=20(verifier)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design for the #219 inline prerequisite: wire in precise br_table/br_if/br symbolic execution (the deferred ExecutionState/merge_states scaffolding) so the verifier can soundly prove inline-equivalence of br_table-dispatching decides. Approach: state-merging-with-ITE via a label/continuation stack (NOT path forking), exact for the acyclic single-exit case. Grounded in Alive2 (PLDI'21, the production SMT bounded-TV precedent), Kuznetsov state-merging (PLDI'12), WASP (ECOOP'22 wasm br_table rules), Boogie block-predicate WP. Includes the soundness argument, the explicit pitfalls (selector constraint, fall-through join, ⊥-not- havoc for trap arms, branch arity, over-approx-and-skip), 2-phase staging (sem_give integer first, then memory for mutex/pipe), and the adversarial verify-or-revert + silicon test gate. For gale review before implementation. Refs #219 --- docs/design/pr-c-precise-cf-verification.md | 168 ++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 docs/design/pr-c-precise-cf-verification.md diff --git a/docs/design/pr-c-precise-cf-verification.md b/docs/design/pr-c-precise-cf-verification.md new file mode 100644 index 0000000..cf54553 --- /dev/null +++ b/docs/design/pr-c-precise-cf-verification.md @@ -0,0 +1,168 @@ +# PR-C: Precise acyclic control-flow symbolic execution in the Z3 verifier + +Status: **design, for review** (gale #219). Prerequisite for inlining the 5 +`br_table`-dispatching decides (sem_give 860, mutex_unlock 472, pipe_write, +pipe_read, msgq_put) and for removing the existing `BrIf`/`BrTable` skips in +LICM/CSE/code_folding/coalesce_locals. + +## Problem + +loom's live verifier (`verify.rs`) does **not** soundly model multi-way control +flow. In both `encode_function_to_smt_impl_inner` and `encode_block_body`: + +- `BrIf(_depth)` pops the condition and **continues** ("a more precise encoding + would fork paths here"). +- `BrTable { .. }` pops the index and **`break`s** ("treat as terminating for + now"). +- `Br(_depth)` / `Return` `break`; branch **depth is ignored** (no label stack). + +So a function whose result depends on a `br_table` dispatch is modeled as if the +dispatch didn't happen. Reusing this to prove an inline equivalent would let Z3 +pick any arm for any selector → a **false** equivalence → an unsound inline that +loom's structural CI cannot catch (the #196/#220 failure class). This is why +LICM/CSE/etc. currently **skip** `BrIf`/`BrTable` functions entirely. + +The precise machinery already exists as **dead scaffolding**, flagged in-source +as the deferred upgrade: `struct ExecutionState { stack, locals, globals, +path_condition: Bool, reachable }`, `merge_states(cond, t, f)`, and +`BlockResult.branch_depth`. `merge_bv(cond, a, b) = cond.ite(a, b)` is live (used +for `If`). PR-C wires the scaffolding in. + +## Chosen approach (with precedent) + +**State-merging-with-ITE driven by an explicit label/continuation stack — never +path-forking.** For a bounded, acyclic (loops=0 on the critical path), +single-normal-exit region this is **exact** (a passing SMT check is a real +proof, not a bounded approximation), and it produces **one** verification +condition instead of N per-arm obligations. + +Precedent (all verified against primary sources): + +- **Alive2** (Lopes, Lee, Hur, Liu, Regehr, *Bounded Translation Validation for + LLVM*, PLDI 2021) — the closest production precedent. "We do **not** fork + expressions across paths in the CFG"; phi/branch merges become "a single SMT + expression per register"; the final state is "a linear chain of `ite` + expressions"; an explicit **`ub`** flag makes ill-defined paths refine rather + than vanish. `switch` is the N-way generalization of its 2-way branch merge. +- **Kuznetsov, Kinder, Bucur, Candea**, *Efficient State Merging in Symbolic + Execution*, PLDI 2012 — the exact merge rule `(ℓ, pc'∨pc'', λv. ite(pc', + s'[v], s''[v]))`; "does not over-approximate, no false positives." +- **WASP** (Marques et al., *Concolic Execution for WebAssembly*, ECOOP 2022) — + the literal wasm rules: `br_table` arm `k≤n` → `Cont(br j_{k+1})` with path + condition `ŝ = k`; default `k≥n` → `Cont(br jn)` with `ŝ ≥ n`. The selector + constraint is conjoined onto every arm; the target is resolved by label index. +- **Barnett & Leino**, *Weakest-Precondition of Unstructured Programs*, PASTE + 2005 — per-block predicates keep the VC **linear** in paths (a wide `br_table` + doesn't blow up); acyclic ⇒ topological, no recursion. +- **WebAssembly Core Spec** + Haas et al. (PLDI 2017) — wasm is structured with + a label stack; `br N` targets the N-th enclosing label (0 = innermost); + `br_table` takes the default when the operand is out of bounds. + +Per-path forking is also sound but is the wrong fit: exponential over a wide +`br_table`, N separate obligations. + +## The model + +Carry a single symbolic `ExecutionState`: +`{ stack: Vec, locals: Vec, globals: Vec, memory: Array, + path_condition: Bool, trapped: Bool }` (`trapped` = Alive2's `ub`/⊥). + +A **label/continuation stack**: each entry records `{ arity, join: JoinAccumulator }`. +`block`/`if`/`loop` push a label; the entry's `join` accumulates the merged state +of every branch that targets it. + +- `block tf … end`, `if … end`: push a label whose continuation is the code + **after** `end` (forward join). +- `br N`: resolve the N-th label from the top; **merge** the current state into + that label's `join` under `path_condition` (carry `arity` result values); mark + the current path terminated. +- `br_if N`: split — branch path `pc ∧ (cond≠0)` merges into the target join; + fall-through path `pc ∧ (cond=0)` continues. +- `br_table j0..jn-1 (default jd)` (per WASP): for each arm `i`, merge under + `pc ∧ (sel = i)` into label `ji`'s join; default under `pc ∧ (sel ≥ n)` into + `jd`'s join. Guards **partition** the selector domain (totality from the + default complement). +- At each `end` / the function exit (the arms' post-dominator), the label's + `join` is the **ITE-merge** of all incoming branches: `path_condition` = + disjunction; each value = `ite(guard, …)` (reusing `merge_bv` / `merge_states`). +- `unreachable`, and a `Call` to a no-return callee (`is_noreturn_callee`: no + `Return`, no `Br*` to the function label, every path ends in `Unreachable`), + set `trapped` (⊥) — **never havoc**. They constrain reachability only; on a + trapped path the return value is don't-care and the path drops out of the + result ITE. + +**Crucially**: this *one* executor models **both** the original `call decide` +(by-body) and the inlined body, so they yield the **same** merged expression → +equivalence is provable (the #155 by-body principle, extended to CF). + +## Soundness — and the pitfalls we explicitly guard + +1. **Exact for acyclic** — no fixpoint, no loop invariants; finite single pass; + merge only at structured-block ends + the single exit (post-dominators). +2. **Selector constraint preserved** — every `br_table` arm carries `sel = i`, + default carries `sel ≥ n`; guards partition the domain. (Avoids the + "terminate" false-equivalence.) +3. **Fall-through resolved via the label stack** to the correct join (a `br` to + an outer label skips intervening joins). +4. **⊥, not havoc, for trap/unreachable/no-return arms** — they refine, never + forge a matching value (the #155/#159 + Alive2 `ub` discipline). +5. **Branch arity recorded** — `br N` carries the label's result values; the + merged stack shape stays correct. +6. **Over-approximate-and-skip** anything unmodeled (LOOM's "skip rather than + risk"): any instruction/shape outside the precise model → the callee is not + by-body-modelable → opaque fallback (no inline), never a false ≡. + +## Scope / staging + +- **Loops stay out** — still unrolled (`MAX_LOOP_UNROLL`) / k-induction / skipped + as today. PR-C is acyclic-only. A back-edge ⇒ not by-body-modelable. +- **Phase 1 — integer acyclic CF (no memory):** activate `ExecutionState` + + label stack + `merge_states`; implement `Br`/`BrIf`/`BrTable`/`Block`/`If` + precisely in `encode_block_body`; add `is_noreturn_callee` + diverge-on-trap; + extend `is_inline_modelable_instr`/`callee_inlinable_by_body` to admit acyclic + CF + no-return calls; route the by-body modeler through the precise executor. + **Unblocks sem_give (860)** — pure integer + br_table, no memory. +- **Phase 2 — thread `memory: Array` through the CF executor** (today + `encode_block_body` has no memory param). Unblocks **mutex_unlock (472)** + + **pipe_write/read** (sret stores through the shadow frame). + +## Test strategy (the soundness gate) + +- **Adversarial verify-or-revert** (the hard gate): deliberately-wrong inlined + bodies MUST produce a Z3 counterexample → revert — wrong arm selected, dropped + selector guard, dropped trap branch, mis-merged join, wrong branch depth. +- **Differential**: correct inline of a `br_table` fixture proves; a near-miss + reverts. Fixtures ordered per gale: msgq_put (trap-only) → sem_give + (`block`+`br_table`+`panic→unreachable`) → mutex_unlock (most blocks, sret). +- **Unit**: `is_noreturn_callee`, the label-stack resolution, `merge_states`. +- **Regression**: full `cargo test` + dogfooding (loom optimizing itself) + unchanged; re-validate the LICM/CSE/etc. paths once their `br_table` skips are + removed (a bonus coverage win, but it widens what they verify). +- **e2e + silicon**: `loom optimize` on `repro-219/sem.loom.wasm` dissolves (no + `call $..._decide`, no i64 pack/unpack in `z_impl_k_sem_give`); then gale's + G474RE re-flash (sem_give 860→, mutex_unlock 472→) is the kill-criterion. + +## Risk + +Substantial, soundness-critical (touches the core verifier). De-risked by: the +acyclic restriction (the only source of unsoundness/incompleteness — loops — is +excluded), the existing `ExecutionState`/`merge_states` scaffolding, the +Alive2/WASP precedent, and the adversarial-revert + silicon gates. Staged so +Phase 1 (sem_give) lands and validates before Phase 2 (memory). WIP branch, no +merge until adversarial tests pass + gale silicon-validates. + +## References + +- Lopes, Lee, Hur, Liu, Regehr. *Alive2: Bounded Translation Validation for + LLVM.* PLDI 2021. https://users.cs.utah.edu/~regehr/alive2-pldi21.pdf +- Kuznetsov, Kinder, Bucur, Candea. *Efficient State Merging in Symbolic + Execution.* PLDI 2012. https://dslab.epfl.ch/pubs/stateMerging.pdf +- Avgerinos, Rebert, Cha, Brumley. *Enhancing Symbolic Execution with + Veritesting.* ICSE 2014. https://softsec.kaist.ac.kr/~sangkilc/papers/avgerinos-icse14.pdf +- Marques, Fragoso Santos, Santos, Adão. *Concolic Execution for WebAssembly + (WASP).* ECOOP 2022. https://drops.dagstuhl.de/opus/volltexte/2022/16239/pdf/LIPIcs-ECOOP-2022-11.pdf +- Barnett & Leino. *Weakest-Precondition of Unstructured Programs.* PASTE 2005. + https://www.microsoft.com/en-us/research/wp-content/uploads/2005/01/krml157.pdf +- Haas et al. *Bringing the Web up to Speed with WebAssembly.* PLDI 2017. +- Van Hattum et al. *Lightweight, Modular Verification for Wasm-to-Native + Instruction Selection (Crocus/veri-isle).* ASPLOS 2024. From 9808bfebf9c42e1e044f17f2679ea159c2c05f06 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 08:07:49 +0200 Subject: [PATCH 07/20] =?UTF-8?q?feat(219):=20PR-C=20M2=20=E2=80=94=20is?= =?UTF-8?q?=5Fnoreturn=5Fcallee=20classifier=20(divergent-call=20detection?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First component of the PR-C precise-CF inline-verifier extension (design approved by gale on #219, 2026-06-17). `is_noreturn_callee(func)`: conservative-sound — true iff the body has no Return and no Br/BrIf/BrTable anywhere (recursing) and ends in Unreachable. Admits exactly the core::panicking helpers (panic_fmt → unreachable; panic_const_add_overflow → call panic_fmt; unreachable) and rejects any normal-returning or branching function. The by-body inline modeler (PR-C M3, next) will use this to encode a `call` to such a callee as ⊥ "diverge if reached" — never havoc — so a trapping branch constrains only reachability, never the post-state (Alive2 `ub` discipline; avoids the #155/#159 havoc-vs-concrete false-positive surface). Test: test_is_noreturn_callee (bare trap / trap-after-call / normal-return / no-trailing-trap / branch-escape). #[allow(dead_code)] until the M3 gate wires it. Refs #219 --- loom-core/src/lib.rs | 34 +++++++++++++++++++++++++++++++ loom-core/src/verify.rs | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index d281781..b0cac8f 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -17821,6 +17821,40 @@ mod tests { ); } + #[cfg(feature = "verification")] + #[test] + fn test_is_noreturn_callee() { + // PR-C (#219): the divergent-call classifier. No-return = no Return, no + // branch anywhere, body ends in Unreachable (the core::panicking shape). + let cases: &[(&str, bool)] = &[ + // bare trap + ("(module (func (export \"f\") unreachable))", true), + // call (to an import) then trap — the panic_const_add_overflow shape + ( + "(module (import \"env\" \"p\" (func)) (func (export \"f\") call 0 unreachable))", + true, + ), + // normal return — must NOT be no-return + ("(module (func (export \"f\") (result i32) i32.const 0))", false), + // ends in a value, not unreachable + ("(module (func (export \"f\") nop))", false), + // contains a branch (could escape to the fn label) — conservatively false + ( + "(module (func (export \"f\") (block br 0) unreachable))", + false, + ), + ]; + for (wat, expected) in cases { + let module = parse::parse_wat(wat).expect("parse"); + let f = &module.functions[0]; + assert_eq!( + crate::verify::is_noreturn_callee(f), + *expected, + "#219 is_noreturn_callee mismatch for: {wat}" + ); + } + } + #[test] fn test_conversion_trunc_sat_folds_nan_to_zero() { // i32.trunc_sat_f32_s of NaN → i32.const 0 (saturating: NaN→0) diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index 8471cc1..f07ddb7 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -209,6 +209,51 @@ fn callee_inlinable_by_body(func: &Function) -> bool { func.instructions.iter().all(is_inline_modelable_instr) } +/// PR-C (#219): true if `func` provably never returns to its caller — every +/// path traps. Conservative-sound definition: the body contains NO `Return` +/// and NO branch (`Br`/`BrIf`/`BrTable`) anywhere (recursing into nested +/// control flow), and its final top-level instruction is `Unreachable`. With +/// no `return` and no branch, the only way to leave the function is to fall off +/// the end — which `Unreachable` makes a trap. This admits exactly the +/// `core::panicking` helpers here (`panic_fmt → unreachable`, +/// `panic_const_add_overflow → call panic_fmt; unreachable`) and rejects any +/// normal-returning or branching function. +/// +/// Used by the by-body inline modeler to encode a `call` to such a callee as +/// "diverge if reached" (⊥) rather than havoc: control never returns, so the +/// call constrains only the reachability of the trapping branch, never the +/// post-state (Alive2's `ub` discipline). Modeling it as a fresh symbolic +/// (havoc) would reopen the #155/#159 havoc-vs-concrete false-positive surface. +#[cfg(feature = "verification")] +#[allow(dead_code)] // wired into the by-body inline gate in PR-C M3 (#219) +pub(crate) fn is_noreturn_callee(func: &Function) -> bool { + !body_has_branch_or_return(&func.instructions) + && matches!(func.instructions.last(), Some(Instruction::Unreachable)) +} + +/// Recursively detect any `Return` or branch (`Br`/`BrIf`/`BrTable`) in a body, +/// descending into Block/Loop/If. Used by `is_noreturn_callee` — a branch could +/// transfer control to the function-level label (a return-via-br), so any +/// branch conservatively disqualifies the "every path traps" classification. +#[cfg(feature = "verification")] +fn body_has_branch_or_return(instrs: &[Instruction]) -> bool { + instrs.iter().any(|i| match i { + Instruction::Return + | Instruction::Br(_) + | Instruction::BrIf(_) + | Instruction::BrTable { .. } => true, + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + body_has_branch_or_return(body) + } + Instruction::If { + then_body, + else_body, + .. + } => body_has_branch_or_return(then_body) || body_has_branch_or_return(else_body), + _ => false, + }) +} + /// loom#151/#155: whitelist of instructions the by-body inline modeler can /// encode faithfully and **identically to the main encoder**. The callee must /// be straight-line (no Block/Loop/If), leaf (no calls), single-result, and From 65193f540bc03bd8fc4a1fc5515968284516c025 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 10:58:09 +0200 Subject: [PATCH 08/20] =?UTF-8?q?feat(219):=20PR-C=20M1a=20=E2=80=94=20pre?= =?UTF-8?q?cise=20acyclic=20CF=20executor=20(isolated,=20Z3-tested)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The soundness-critical core of the inline-modelability extension: a precise symbolic executor for structured, acyclic wasm control flow (block / if / br / br_if / br_table) by state-merging-with-ITE over a label/join stack — never path-forking. For a bounded acyclic region this is EXACT (a passing SMT check is a real proof). Precedent: Alive2 (PLDI'21), Kuznetsov state-merging (PLDI'12), WASP br_table rules (ECOOP'22). See docs/design/pr-c-precise-cf-verification.md. Built as a NEW ISOLATED function, not by mutating the shared encoders (encode_block_body / encode_function_to_smt_impl_inner stay untouched -> zero regression risk to existing passes). encode_simple_instruction's catch-all HAVOCS (pushes a fresh BV without popping operands), so delegating an unmodeled op would desync the stack; instead an allowlist gate (is_acyclic_modelable_body) front-loads soundness and the executor runs only on admitted functions. Over-approximate-and-skip everything else: loops (back-edge), calls, memory, unknown opcodes, multi-result/param blocks. br_table default guard is UNSIGNED bvuge(sel, n) — a signed >= would leave selectors in [2^31, 2^32) in a partition gap. All #[cfg(feature="verification")] + #[allow(dead_code)] until M3 wires it into inline_functions' verify-or-revert. Tests (5, green): br_table switch proves equivalent to the if/else chain; identity; detects wrong-default and swapped-arm counterexamples; gate rejects loops/memory, admits the switch. Also folds in cargo-fmt drift in loom-shared from #219 SROA @4356441. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 5 +- loom-core/src/verify.rs | 643 ++++++++++++++++++++++++++++++++++++++++ loom-shared/src/lib.rs | 4 +- 3 files changed, 648 insertions(+), 4 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index b0cac8f..2a25445 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -17835,7 +17835,10 @@ mod tests { true, ), // normal return — must NOT be no-return - ("(module (func (export \"f\") (result i32) i32.const 0))", false), + ( + "(module (func (export \"f\") (result i32) i32.const 0))", + false, + ), // ends in a value, not unreachable ("(module (func (export \"f\") nop))", false), // contains a branch (could escape to the fn label) — conservatively false diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index f07ddb7..1c7cec1 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7214,6 +7214,492 @@ fn encode_simple_instruction( Ok(()) } +// =========================================================================== +// PR-C (gale #219): precise acyclic control-flow symbolic execution. +// +// An ISOLATED executor that models structured, acyclic wasm control flow +// (block / if / br / br_if / br_table) by *state merging with ITE* driven by an +// explicit label/join stack — never path-forking. For a bounded acyclic region +// this is EXACT (a passing SMT check is a real proof, not a bounded +// approximation), and it produces ONE verification condition instead of N +// per-arm obligations. Precedent: Alive2 (PLDI'21, "we do not fork expressions +// across paths"; final state a linear chain of ITEs), Kuznetsov state-merging +// (PLDI'12), WASP br_table rules (ECOOP'22). See +// docs/design/pr-c-precise-cf-verification.md. +// +// Phase 1 = integer only (no memory): every non-control instruction must be in +// the `is_acyclic_simple_modelable` allowlist (the ops `encode_simple_instruction` +// models PRECISELY) and is delegated there. Loop (back-edge), Call/CallIndirect, +// memory ops, unknown opcodes, and multi-result/param blocks are REJECTED — the +// `is_acyclic_modelable_body` gate fails the whole function and the caller falls +// back (over-approximate-and-skip), NEVER a false equivalence. We must reject +// rather than delegate-and-hope because `encode_simple_instruction`'s catch-all +// HAVOCS (pushes a fresh BV without popping operands) → it would desync the +// stack for an unmodeled op. +// +// Currently `#[allow(dead_code)]`: built + tested in isolation (Z3 equivalence +// over hand-written br_table fixtures, incl. the 0x8000_0000-selector default +// case) BEFORE being wired into the live inline-verify path (PR-C M3). +// =========================================================================== + +/// Symbolic state threaded along the *current* straight-line path. +#[cfg(feature = "verification")] +#[allow(dead_code)] +struct AcyclicState { + stack: Vec, + locals: Vec, + globals: Vec, + /// Path condition guarding the current straight-line path. + path: Bool, + /// False once this path has branched / returned (subsequent straight-line + /// instructions in the same sequence are dead and must not be executed). + reachable: bool, +} + +/// One control label. Every branch targeting it accumulates an entry; at the +/// label's join point the entries ITE-merge into a single successor state. +/// `entries`: `(guard, result_values, locals_snapshot, globals_snapshot)`. +/// +/// One incoming branch to a label: its path guard plus the result/locals/globals +/// snapshot at the branch point. +#[cfg(feature = "verification")] +type BranchEntry = (Bool, Vec, Vec, Vec); + +/// The ITE-merged successor of a join: result values, locals, globals, and the +/// merged path condition. +#[cfg(feature = "verification")] +type MergedState = (Vec, Vec, Vec, Bool); + +#[cfg(feature = "verification")] +#[allow(dead_code)] +struct LabelJoin { + /// Number of result values the label carries (0 or 1 in Phase 1). + arity: usize, + entries: Vec, +} + +/// Result arity of a structured block type, or `Err` if it has parameters or +/// more than one result (rejected in Phase 1 — over-approximate-and-skip). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_block_arity(block_type: &BlockType) -> Result { + match block_type { + BlockType::Empty => Ok(0), + BlockType::Value(_) => Ok(1), + BlockType::Func { params, results } => { + if !params.is_empty() || results.len() > 1 { + Err(anyhow!( + "acyclic executor: block with params/multi-result not modeled" + )) + } else { + Ok(results.len()) + } + } + } +} + +/// Allowlist of non-control instructions `encode_simple_instruction` models +/// PRECISELY (pure integer ops + locals/globals, no memory, no havoc). Anything +/// outside this set is rejected by `is_acyclic_modelable_body`. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn is_acyclic_simple_modelable(instr: &Instruction) -> bool { + matches!( + instr, + Instruction::I32Const(_) + | Instruction::I64Const(_) + | Instruction::F32Const(_) + | Instruction::F64Const(_) + | Instruction::I32Add + | Instruction::I32Sub + | Instruction::I32Mul + | Instruction::I32And + | Instruction::I32Or + | Instruction::I32Xor + | Instruction::I32Shl + | Instruction::I32ShrS + | Instruction::I32ShrU + | Instruction::I32DivS + | Instruction::I32DivU + | Instruction::I32RemS + | Instruction::I32RemU + | Instruction::I32Eq + | Instruction::I32Ne + | Instruction::I32LtS + | Instruction::I32LtU + | Instruction::I32GtS + | Instruction::I32GtU + | Instruction::I32LeS + | Instruction::I32LeU + | Instruction::I32GeS + | Instruction::I32GeU + | Instruction::I32Eqz + | Instruction::I64Add + | Instruction::I64Sub + | Instruction::I64Mul + | Instruction::I64And + | Instruction::I64Or + | Instruction::I64Xor + | Instruction::I64Shl + | Instruction::I64ShrS + | Instruction::I64ShrU + | Instruction::I64DivS + | Instruction::I64DivU + | Instruction::I64RemS + | Instruction::I64RemU + | Instruction::I64Eq + | Instruction::I64Ne + | Instruction::I64LtS + | Instruction::I64LtU + | Instruction::I64GtS + | Instruction::I64GtU + | Instruction::I64LeS + | Instruction::I64LeU + | Instruction::I64GeS + | Instruction::I64GeU + | Instruction::I64Eqz + | Instruction::I32WrapI64 + | Instruction::I64ExtendI32S + | Instruction::I64ExtendI32U + | Instruction::LocalGet(_) + | Instruction::LocalSet(_) + | Instruction::LocalTee(_) + | Instruction::GlobalGet(_) + | Instruction::GlobalSet(_) + | Instruction::Drop + | Instruction::Nop + | Instruction::Select + ) +} + +/// Whole-function gate: true iff every instruction (recursing into block/if +/// bodies) is a modeled control op or an allowlisted simple op. Loops, calls, +/// memory, unknown opcodes, and unmodeled block shapes fail. The executor runs +/// ONLY on functions that pass this gate. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn is_acyclic_modelable_body(body: &[Instruction]) -> bool { + body.iter().all(|instr| match instr { + // Back-edges and calls are out of Phase 1 scope. + Instruction::Loop { .. } | Instruction::Call(_) | Instruction::CallIndirect { .. } => false, + Instruction::Block { block_type, body } => { + acyclic_block_arity(block_type).is_ok() && is_acyclic_modelable_body(body) + } + Instruction::If { + block_type, + then_body, + else_body, + } => { + acyclic_block_arity(block_type).is_ok() + && is_acyclic_modelable_body(then_body) + && is_acyclic_modelable_body(else_body) + } + Instruction::Br(_) + | Instruction::BrIf(_) + | Instruction::BrTable { .. } + | Instruction::Return => true, + other => is_acyclic_simple_modelable(other), + }) +} + +/// Resolve a relative branch depth to an absolute index in the label stack +/// (0 = innermost = top). `labels[0]` is the implicit function label. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_label_index(labels: &[LabelJoin], depth: u32) -> Result { + let depth = depth as usize; + if depth >= labels.len() { + return Err(anyhow!( + "acyclic executor: branch depth {} out of range", + depth + )); + } + Ok(labels.len() - 1 - depth) +} + +/// Record the current state as an incoming branch to `labels[target]`, carrying +/// that label's arity result values from the top of the stack. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_push_branch( + labels: &mut [LabelJoin], + target: usize, + guard: Bool, + state: &AcyclicState, +) -> Result<()> { + let arity = labels[target].arity; + if state.stack.len() < arity { + return Err(anyhow!("acyclic executor: stack underflow at branch")); + } + let results: Vec = state.stack[state.stack.len() - arity..].to_vec(); + labels[target] + .entries + .push((guard, results, state.locals.clone(), state.globals.clone())); + Ok(()) +} + +/// ITE-merge a label's incoming branches into one successor. Guards are pairwise +/// disjoint (if: cond vs ¬cond; br_table: sel==i vs sel≥n unsigned), so the +/// fold `ite(g0, v0, ite(g1, v1, … v_last))` selects the live branch under the +/// merged path (= disjunction of guards). Returns `(results, locals, globals, +/// path)`, or `None` if no branch reaches the label (unreachable join). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_merge_entries(entries: &[BranchEntry]) -> Option { + let (_, last_res, last_loc, last_glob) = entries.last()?; + let mut res = last_res.clone(); + let mut loc = last_loc.clone(); + let mut glob = last_glob.clone(); + let mut guards: Vec = vec![entries.last().unwrap().0.clone()]; + // Fold earlier entries outward: each takes precedence under its own guard. + for (g, r, l, gl) in entries[..entries.len() - 1].iter().rev() { + res = r + .iter() + .zip(res.iter()) + .map(|(t, f)| merge_bv(g, t, f)) + .collect(); + loc = l + .iter() + .zip(loc.iter()) + .map(|(t, f)| merge_bv(g, t, f)) + .collect(); + glob = gl + .iter() + .zip(glob.iter()) + .map(|(t, f)| merge_bv(g, t, f)) + .collect(); + guards.push(g.clone()); + } + let path = Bool::or(&guards); + Some((res, loc, glob, path)) +} + +/// Apply a popped label's join to the state: the post-block state is the +/// ITE-merge of all branches that targeted it, with the block's result values +/// pushed on top of `base_stack` (the operand stack below the block entry). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_apply_join( + state: &mut AcyclicState, + base_stack: Vec, + join: &LabelJoin, +) -> Result<()> { + match acyclic_merge_entries(&join.entries) { + None => { + // No branch reaches this join — code after the block is unreachable. + state.reachable = false; + Ok(()) + } + Some((results, locals, globals, path)) => { + let mut stack = base_stack; + stack.extend(results); + state.stack = stack; + state.locals = locals; + state.globals = globals; + state.path = path; + state.reachable = true; + Ok(()) + } + } +} + +/// Execute a structured, acyclic instruction sequence, merging branch states at +/// label joins. Precondition: the enclosing function passed +/// `is_acyclic_modelable_body`. `labels[0]` is the function label. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn exec_acyclic( + body: &[Instruction], + state: &mut AcyclicState, + labels: &mut Vec, +) -> Result<()> { + for instr in body { + if !state.reachable { + // Dead code after an unconditional branch / return. + break; + } + match instr { + Instruction::Block { block_type, body } => { + let arity = acyclic_block_arity(block_type)?; + let base_stack = state.stack.clone(); + labels.push(LabelJoin { + arity, + entries: Vec::new(), + }); + exec_acyclic(body, state, labels)?; + // Fall-through off the end == implicit `br 0` to this block. + if state.reachable { + let target = labels.len() - 1; + acyclic_push_branch(labels, target, state.path.clone(), state)?; + } + let join = labels.pop().unwrap(); + acyclic_apply_join(state, base_stack, &join)?; + } + Instruction::If { + block_type, + then_body, + else_body, + } => { + let arity = acyclic_block_arity(block_type)?; + let cond = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: If underflow"))?; + let cond_nz = cond.eq(BV::from_i64(0, 32)).not(); + let base_stack = state.stack.clone(); + let saved_locals = state.locals.clone(); + let saved_globals = state.globals.clone(); + let saved_path = state.path.clone(); + labels.push(LabelJoin { + arity, + entries: Vec::new(), + }); + let target = labels.len() - 1; + // then-branch under path ∧ cond≠0 + state.path = Bool::and(&[saved_path.clone(), cond_nz.clone()]); + state.reachable = true; + exec_acyclic(then_body, state, labels)?; + if state.reachable { + acyclic_push_branch(labels, target, state.path.clone(), state)?; + } + // else-branch under path ∧ cond=0 (restore entry state) + state.stack = base_stack.clone(); + state.locals = saved_locals; + state.globals = saved_globals; + state.path = Bool::and(&[saved_path, cond_nz.not()]); + state.reachable = true; + exec_acyclic(else_body, state, labels)?; + if state.reachable { + acyclic_push_branch(labels, target, state.path.clone(), state)?; + } + let join = labels.pop().unwrap(); + acyclic_apply_join(state, base_stack, &join)?; + } + Instruction::Br(depth) => { + let target = acyclic_label_index(labels, *depth)?; + acyclic_push_branch(labels, target, state.path.clone(), state)?; + state.reachable = false; + } + Instruction::BrIf(depth) => { + let cond = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: BrIf underflow"))?; + let cond_nz = cond.eq(BV::from_i64(0, 32)).not(); + let target = acyclic_label_index(labels, *depth)?; + let branch_guard = Bool::and(&[state.path.clone(), cond_nz.clone()]); + acyclic_push_branch(labels, target, branch_guard, state)?; + // Fall-through path narrows to cond=0. + state.path = Bool::and(&[state.path.clone(), cond_nz.not()]); + } + Instruction::BrTable { targets, default } => { + let sel = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: BrTable underflow"))?; + let n = targets.len() as u64; + // Arm i is taken iff sel == i. + for (i, d) in targets.iter().enumerate() { + let target = acyclic_label_index(labels, *d)?; + let guard = + Bool::and(&[state.path.clone(), sel.eq(BV::from_u64(i as u64, 32))]); + acyclic_push_branch(labels, target, guard, state)?; + } + // Default is taken iff sel >=u n (UNSIGNED — gale must-fix: a + // signed >= would leave selectors in [2^31, 2^32) in a gap + // between the arm guards and the default, letting Z3 pick any + // arm → a false equivalence). + let target = acyclic_label_index(labels, *default)?; + let guard = Bool::and(&[state.path.clone(), sel.bvuge(BV::from_u64(n, 32))]); + acyclic_push_branch(labels, target, guard, state)?; + state.reachable = false; + } + Instruction::Return => { + // Branch to the implicit function label. + acyclic_push_branch(labels, 0, state.path.clone(), state)?; + state.reachable = false; + } + other => { + // Defense in depth: never delegate an unmodeled op (the + // delegate's catch-all havocs and would desync the stack). + if !is_acyclic_simple_modelable(other) { + return Err(anyhow!( + "acyclic executor: instruction not modelable: {:?}", + other + )); + } + encode_simple_instruction( + other, + &mut state.stack, + &mut state.locals, + &mut state.globals, + )?; + } + } + } + Ok(()) +} + +/// Encode a function via the precise acyclic CF executor. Mirrors +/// `encode_function_to_smt`'s deterministic input naming (`param{i}`, +/// `global{i}`) so two encodings UNIFY in Z3 and can be compared directly. +/// Returns `Err` if the function is not acyclic-modelable (caller falls back). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn encode_acyclic_function(func: &Function) -> Result> { + if !is_acyclic_modelable_body(&func.instructions) { + return Err(anyhow!("function not acyclic-modelable")); + } + let arity = func.signature.results.len(); + if arity > 1 { + return Err(anyhow!("acyclic executor: multi-result not modeled")); + } + + let mut locals: Vec = Vec::new(); + for (idx, pt) in func.signature.params.iter().enumerate() { + locals.push(BV::new_const( + format!("param{}", idx), + bv_width_for_value_type(*pt), + )); + } + for (count, lt) in func.locals.iter() { + let width = bv_width_for_value_type(*lt); + for _ in 0..*count { + locals.push(BV::from_u64(0, width)); + } + } + let mut globals: Vec = Vec::new(); + for i in 0..16 { + globals.push(BV::new_const(format!("global{}", i), 32)); + } + + let mut state = AcyclicState { + stack: Vec::new(), + locals, + globals, + path: Bool::from_bool(true), + reachable: true, + }; + let mut labels = vec![LabelJoin { + arity, + entries: Vec::new(), + }]; + exec_acyclic(&func.instructions, &mut state, &mut labels)?; + // Fall-through off the function body == implicit return. + if state.reachable { + acyclic_push_branch(&mut labels, 0, state.path.clone(), &state)?; + } + let func_join = labels.pop().unwrap(); + + if arity == 0 { + return Ok(None); + } + match acyclic_merge_entries(&func_join.entries) { + Some((results, _, _, _)) => Ok(Some(results[0].clone())), + None => Err(anyhow!("acyclic executor: function has no reachable exit")), + } +} + /// Verify that block stack properties are preserved across optimization /// /// Uses Z3 to formally verify that a block transformation preserves: @@ -7442,6 +7928,163 @@ mod tests { use super::*; use crate::parse; + // ---- PR-C (gale #219) M1a: precise acyclic CF executor, tested in isolation ---- + + /// Encode both functions via the precise acyclic executor (deterministic + /// input names → they unify in Z3) and check semantic equivalence. + /// `Ok(true)` = proven equivalent, `Ok(false)` = counterexample found. + #[cfg(feature = "verification")] + fn acyclic_equiv(wat_a: &str, wat_b: &str) -> Result { + let ma = parse::parse_wat(wat_a).unwrap(); + let mb = parse::parse_wat(wat_b).unwrap(); + let fa = &ma.functions[0]; + let fb = &mb.functions[0]; + let cfg = z3_config_with_timeout(); + with_z3_config(&cfg, || { + let solver = Solver::new(); + let ra = encode_acyclic_function(fa)?; + let rb = encode_acyclic_function(fb)?; + match (ra, rb) { + (Some(a), Some(b)) => { + if a.get_size() != b.get_size() { + return Ok(false); + } + solver.assert(a.eq(&b).not()); + match solver.check() { + SatResult::Unsat => Ok(true), + SatResult::Sat => Ok(false), + SatResult::Unknown => Err(anyhow!("solver unknown")), + } + } + _ => Err(anyhow!("void/encode mismatch")), + } + }) + } + + /// 3-way `br_table` switch (0→10, 1→20, default→30). + #[cfg(feature = "verification")] + const SWITCH_10_20_30: &str = r#" + (module (func (param i32) (result i32) + (block (block (block + local.get 0 + br_table 0 1 2) + i32.const 10 + return) + i32.const 20 + return) + i32.const 30 + return))"#; + + /// Same dispatch expressed as an if/else chain — semantically identical. + #[cfg(feature = "verification")] + const IFELSE_10_20_30: &str = r#" + (module (func (param i32) (result i32) + local.get 0 + i32.eqz + if (result i32) + i32.const 10 + else + local.get 0 + i32.const 1 + i32.eq + if (result i32) + i32.const 20 + else + i32.const 30 + end + end))"#; + + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_brtable_equiv_ifelse() { + // The precise executor must model a br_table switch identically to the + // equivalent if/else chain (independent encodings → real proof). + assert!( + acyclic_equiv(SWITCH_10_20_30, IFELSE_10_20_30).unwrap(), + "br_table switch must prove equivalent to the if/else chain" + ); + } + + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_brtable_identity() { + // A switch is trivially equivalent to itself (sanity: the executor is + // deterministic and the merge is stable). + assert!(acyclic_equiv(SWITCH_10_20_30, SWITCH_10_20_30).unwrap()); + } + + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_brtable_detects_difference() { + // A switch whose default returns 99 instead of 30 must NOT prove + // equivalent — the executor must distinguish, not over-approve. + let switch_wrong_default = r#" + (module (func (param i32) (result i32) + (block (block (block + local.get 0 + br_table 0 1 2) + i32.const 10 + return) + i32.const 20 + return) + i32.const 99 + return))"#; + assert!( + !acyclic_equiv(SWITCH_10_20_30, switch_wrong_default).unwrap(), + "differing default arm must yield a counterexample (sel >= 2)" + ); + } + + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_brtable_detects_swapped_arms() { + // Swapping arm 0 and arm 1 results must be caught (sel==0 / sel==1). + let swapped = r#" + (module (func (param i32) (result i32) + (block (block (block + local.get 0 + br_table 0 1 2) + i32.const 20 + return) + i32.const 10 + return) + i32.const 30 + return))"#; + assert!( + !acyclic_equiv(SWITCH_10_20_30, swapped).unwrap(), + "swapped arm values must yield a counterexample" + ); + } + + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_modelable_gate() { + // The whole-function gate must reject loops, calls, and memory ops. + let with_loop = + parse::parse_wat("(module (func (param i32) (result i32) (loop local.get 0)))") + .unwrap(); + assert!( + !is_acyclic_modelable_body(&with_loop.functions[0].instructions), + "loops (back-edges) must be rejected" + ); + + let with_memory = parse::parse_wat( + "(module (memory 1) (func (param i32) (result i32) local.get 0 i32.load))", + ) + .unwrap(); + assert!( + !is_acyclic_modelable_body(&with_memory.functions[0].instructions), + "memory ops must be rejected in Phase 1" + ); + + // A pure-integer br_table switch must be admitted. + let switch = parse::parse_wat(SWITCH_10_20_30).unwrap(); + assert!( + is_acyclic_modelable_body(&switch.functions[0].instructions), + "pure-integer acyclic br_table function must be admitted" + ); + } + #[test] fn test_verify_constant_folding() { // Original: (i32.add (i32.const 2) (i32.const 3)) diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index 2b9dc3b..2cce36e 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -4482,9 +4482,7 @@ fn rewrite_pure_impl(val: Value) -> Value { let val_simplified = rewrite_pure(val.clone()); match val_simplified.data() { // Constant folding: extend_i32_u(i32.const N) → i64.const (zero-extended). - ValueData::I32Const { val: v } => { - iconst64(Imm64((v.value() as u32 as u64) as i64)) - } + ValueData::I32Const { val: v } => iconst64(Imm64((v.value() as u32 as u64) as i64)), _ => i64_extend_i32_u(val_simplified), } } From 793f801601cf2365f814e57634167e2af3988a5f Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 12:24:41 +0200 Subject: [PATCH 09/20] =?UTF-8?q?feat(219):=20PR-C=20M1b=20=E2=80=94=20Cal?= =?UTF-8?q?l=20by-body=20modeling=20+=20no-return=20diverge=20in=20the=20a?= =?UTF-8?q?cyclic=20executor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends exec_acyclic to model a `call F` BY BODY (the inline-soundness principle from #155 extended to control flow): pop args, recursively encode the callee through exec_acyclic with params bound to the args, declared locals zero-init, globals SHARED with the caller (callee global writes propagate back); push the merged single result and narrow the caller path to the callee's returning paths. A call to a no-return callee (is_noreturn_callee: panic -> unreachable) sets reachable=false — the path DIVERGES (drops out of the result ITE), NEVER havoc (Alive2 ub discipline; havoc reopens the #155/#159 false-positive surface). Threads VerificationSignatureContext (function_bodies, the existing #151 callee lookup) + a recursion depth into exec_acyclic / encode_acyclic_function / is_acyclic_modelable_body. MAX_ACYCLIC_CALL_DEPTH=8 bounds by-body inlining and makes a cyclic/recursive call graph fail safely (reject, never infinite-unroll). The gate now admits Call(g) iff g is a local function that is no-return or recursively acyclic-modelable; imports/call_indirect/over-deep stay rejected. exec_acyclic remains the soundness enforcer (Err, never havoc, on anything unmodelable). Still #[cfg(verification)] + #[allow(dead_code)] until M3 wires it into inline_functions' verify-or-revert. Tests (7 total, green): + call-by-body of a br_table callee proves equivalent to its inlined switch (the seam capability); + no-return call diverges not havoc. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/verify.rs | 252 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 229 insertions(+), 23 deletions(-) diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index 1c7cec1..47287d6 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7372,18 +7372,56 @@ fn is_acyclic_simple_modelable(instr: &Instruction) -> bool { ) } +/// Max by-body call-recursion depth the acyclic executor will model. The seam +/// decides call only `panic*` (no-return → diverge, no further recursion), so +/// depth 2 suffices; 8 gives headroom and bounds any cyclic/recursive call +/// graph (which is rejected, not unrolled). +#[cfg(feature = "verification")] +const MAX_ACYCLIC_CALL_DEPTH: usize = 8; + /// Whole-function gate: true iff every instruction (recursing into block/if -/// bodies) is a modeled control op or an allowlisted simple op. Loops, calls, -/// memory, unknown opcodes, and unmodeled block shapes fail. The executor runs -/// ONLY on functions that pass this gate. +/// bodies AND into by-body call targets) is a modeled control op, an allowlisted +/// simple op, or an admissible `Call`. Back-edges (Loop), call_indirect, memory, +/// unknown opcodes, unmodeled block shapes, imports, and over-deep/cyclic call +/// graphs all fail. A `Call(g)` is admissible iff `g` is a local function that +/// is itself no-return (diverges → modeled as ⊥) or recursively acyclic-modelable. +/// +/// This is a fast pre-check for the M3 routing decision; `exec_acyclic` is the +/// actual soundness enforcer (it returns `Err` on anything it cannot model +/// precisely, never havoc). #[cfg(feature = "verification")] #[allow(dead_code)] -fn is_acyclic_modelable_body(body: &[Instruction]) -> bool { +fn is_acyclic_modelable_body( + body: &[Instruction], + ctx: &VerificationSignatureContext, + depth: usize, +) -> bool { body.iter().all(|instr| match instr { - // Back-edges and calls are out of Phase 1 scope. - Instruction::Loop { .. } | Instruction::Call(_) | Instruction::CallIndirect { .. } => false, + // Back-edges and indirect calls are out of Phase 1 scope. + Instruction::Loop { .. } | Instruction::CallIndirect { .. } => false, + Instruction::Call(g) => { + if depth >= MAX_ACYCLIC_CALL_DEPTH { + return false; + } + match ctx + .function_bodies + .get(*g as usize) + .and_then(|o| o.as_deref()) + { + // No-return callee (panic→unreachable): modeled as ⊥ diverge. + Some(callee) if is_noreturn_callee(callee) => true, + // Otherwise the callee must itself be acyclic-modelable, and + // single-result (the executor pushes at most one result value). + Some(callee) => { + callee.signature.results.len() <= 1 + && is_acyclic_modelable_body(&callee.instructions, ctx, depth + 1) + } + // Import (None) or out-of-range: opaque, not modelable. + None => false, + } + } Instruction::Block { block_type, body } => { - acyclic_block_arity(block_type).is_ok() && is_acyclic_modelable_body(body) + acyclic_block_arity(block_type).is_ok() && is_acyclic_modelable_body(body, ctx, depth) } Instruction::If { block_type, @@ -7391,8 +7429,8 @@ fn is_acyclic_modelable_body(body: &[Instruction]) -> bool { else_body, } => { acyclic_block_arity(block_type).is_ok() - && is_acyclic_modelable_body(then_body) - && is_acyclic_modelable_body(else_body) + && is_acyclic_modelable_body(then_body, ctx, depth) + && is_acyclic_modelable_body(else_body, ctx, depth) } Instruction::Br(_) | Instruction::BrIf(_) @@ -7512,6 +7550,8 @@ fn exec_acyclic( body: &[Instruction], state: &mut AcyclicState, labels: &mut Vec, + ctx: &VerificationSignatureContext, + depth: usize, ) -> Result<()> { for instr in body { if !state.reachable { @@ -7526,7 +7566,7 @@ fn exec_acyclic( arity, entries: Vec::new(), }); - exec_acyclic(body, state, labels)?; + exec_acyclic(body, state, labels, ctx, depth)?; // Fall-through off the end == implicit `br 0` to this block. if state.reachable { let target = labels.len() - 1; @@ -7558,7 +7598,7 @@ fn exec_acyclic( // then-branch under path ∧ cond≠0 state.path = Bool::and(&[saved_path.clone(), cond_nz.clone()]); state.reachable = true; - exec_acyclic(then_body, state, labels)?; + exec_acyclic(then_body, state, labels, ctx, depth)?; if state.reachable { acyclic_push_branch(labels, target, state.path.clone(), state)?; } @@ -7568,7 +7608,7 @@ fn exec_acyclic( state.globals = saved_globals; state.path = Bool::and(&[saved_path, cond_nz.not()]); state.reachable = true; - exec_acyclic(else_body, state, labels)?; + exec_acyclic(else_body, state, labels, ctx, depth)?; if state.reachable { acyclic_push_branch(labels, target, state.path.clone(), state)?; } @@ -7619,6 +7659,23 @@ fn exec_acyclic( acyclic_push_branch(labels, 0, state.path.clone(), state)?; state.reachable = false; } + Instruction::Call(g) => { + let callee = ctx + .function_bodies + .get(*g as usize) + .and_then(|o| o.as_deref()) + .ok_or_else(|| { + anyhow!("acyclic executor: call to import/unknown func {}", g) + })?; + if is_noreturn_callee(callee) { + // ⊥-diverge: control never returns (panic→unreachable). The + // path traps and drops out of the result ITE — NEVER havoc + // (Alive2 `ub` discipline; havoc reopens #155/#159). + state.reachable = false; + } else { + encode_acyclic_call(callee, state, ctx, depth)?; + } + } other => { // Defense in depth: never delegate an unmodeled op (the // delegate's catch-all havocs and would desync the stack). @@ -7640,14 +7697,94 @@ fn exec_acyclic( Ok(()) } +/// Model a `call` to `callee` BY BODY: pop the args, recursively encode the +/// callee through `exec_acyclic` (params bound to the args, declared locals +/// zero-init, globals SHARED with the caller so the callee's global writes +/// propagate back), and push the merged single result. Sets `reachable=false` +/// if every callee path traps (no normal exit). The SAME executor models both +/// the original `call F` (here) and the inlined body, so they yield the same +/// expression → the inline is provable. +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn encode_acyclic_call( + callee: &Function, + state: &mut AcyclicState, + ctx: &VerificationSignatureContext, + depth: usize, +) -> Result<()> { + if depth + 1 >= MAX_ACYCLIC_CALL_DEPTH { + return Err(anyhow!("acyclic executor: call recursion too deep")); + } + let arity = callee.signature.results.len(); + if arity > 1 { + return Err(anyhow!("acyclic executor: multi-result callee")); + } + let nparams = callee.signature.params.len(); + if state.stack.len() < nparams { + return Err(anyhow!("acyclic executor: call arg underflow")); + } + // Pop args (last param on top of stack); bind to the callee's param locals. + let mut callee_locals: Vec = state.stack.split_off(state.stack.len() - nparams); + for (count, lt) in callee.locals.iter() { + let width = bv_width_for_value_type(*lt); + for _ in 0..*count { + callee_locals.push(BV::from_u64(0, width)); + } + } + let mut sub = AcyclicState { + stack: Vec::new(), + locals: callee_locals, + // SHARED globals — the callee may read/write module globals; the merged + // result threads the updated globals back to the caller. + globals: state.globals.clone(), + // Inherit the caller's path so callee branch guards conjoin it. + path: state.path.clone(), + reachable: true, + }; + let mut sub_labels = vec![LabelJoin { + arity, + entries: Vec::new(), + }]; + exec_acyclic( + &callee.instructions, + &mut sub, + &mut sub_labels, + ctx, + depth + 1, + )?; + if sub.reachable { + acyclic_push_branch(&mut sub_labels, 0, sub.path.clone(), &sub)?; + } + let join = sub_labels.pop().unwrap(); + match acyclic_merge_entries(&join.entries) { + Some((results, _locals, globals, path)) => { + // Propagate callee global writes; narrow the caller path to the + // callee's returning paths. + state.globals = globals; + state.path = path; + if arity == 1 { + state.stack.push(results[0].clone()); + } + } + None => { + // Callee never returns normally (all paths trap) → caller diverges. + state.reachable = false; + } + } + Ok(()) +} + /// Encode a function via the precise acyclic CF executor. Mirrors /// `encode_function_to_smt`'s deterministic input naming (`param{i}`, /// `global{i}`) so two encodings UNIFY in Z3 and can be compared directly. /// Returns `Err` if the function is not acyclic-modelable (caller falls back). #[cfg(feature = "verification")] #[allow(dead_code)] -fn encode_acyclic_function(func: &Function) -> Result> { - if !is_acyclic_modelable_body(&func.instructions) { +fn encode_acyclic_function( + func: &Function, + ctx: &VerificationSignatureContext, +) -> Result> { + if !is_acyclic_modelable_body(&func.instructions, ctx, 0) { return Err(anyhow!("function not acyclic-modelable")); } let arity = func.signature.results.len(); @@ -7684,7 +7821,7 @@ fn encode_acyclic_function(func: &Function) -> Result> { arity, entries: Vec::new(), }]; - exec_acyclic(&func.instructions, &mut state, &mut labels)?; + exec_acyclic(&func.instructions, &mut state, &mut labels, ctx, 0)?; // Fall-through off the function body == implicit return. if state.reachable { acyclic_push_branch(&mut labels, 0, state.path.clone(), &state)?; @@ -7935,15 +8072,24 @@ mod tests { /// `Ok(true)` = proven equivalent, `Ok(false)` = counterexample found. #[cfg(feature = "verification")] fn acyclic_equiv(wat_a: &str, wat_b: &str) -> Result { + acyclic_equiv_at(wat_a, 0, wat_b, 0) + } + + /// As `acyclic_equiv` but comparing a chosen function index in each module + /// (so a caller that `call`s a sibling can be compared to its inlined form). + #[cfg(feature = "verification")] + fn acyclic_equiv_at(wat_a: &str, ia: usize, wat_b: &str, ib: usize) -> Result { let ma = parse::parse_wat(wat_a).unwrap(); let mb = parse::parse_wat(wat_b).unwrap(); - let fa = &ma.functions[0]; - let fb = &mb.functions[0]; + let ctx_a = VerificationSignatureContext::from_module(&ma); + let ctx_b = VerificationSignatureContext::from_module(&mb); + let fa = &ma.functions[ia]; + let fb = &mb.functions[ib]; let cfg = z3_config_with_timeout(); with_z3_config(&cfg, || { let solver = Solver::new(); - let ra = encode_acyclic_function(fa)?; - let rb = encode_acyclic_function(fb)?; + let ra = encode_acyclic_function(fa, &ctx_a)?; + let rb = encode_acyclic_function(fb, &ctx_b)?; match (ra, rb) { (Some(a), Some(b)) => { if a.get_size() != b.get_size() { @@ -8059,12 +8205,13 @@ mod tests { #[test] #[cfg(feature = "verification")] fn test_acyclic_modelable_gate() { - // The whole-function gate must reject loops, calls, and memory ops. + // The whole-function gate must reject loops and memory ops. let with_loop = parse::parse_wat("(module (func (param i32) (result i32) (loop local.get 0)))") .unwrap(); + let ctx_loop = VerificationSignatureContext::from_module(&with_loop); assert!( - !is_acyclic_modelable_body(&with_loop.functions[0].instructions), + !is_acyclic_modelable_body(&with_loop.functions[0].instructions, &ctx_loop, 0), "loops (back-edges) must be rejected" ); @@ -8072,19 +8219,78 @@ mod tests { "(module (memory 1) (func (param i32) (result i32) local.get 0 i32.load))", ) .unwrap(); + let ctx_mem = VerificationSignatureContext::from_module(&with_memory); assert!( - !is_acyclic_modelable_body(&with_memory.functions[0].instructions), + !is_acyclic_modelable_body(&with_memory.functions[0].instructions, &ctx_mem, 0), "memory ops must be rejected in Phase 1" ); // A pure-integer br_table switch must be admitted. let switch = parse::parse_wat(SWITCH_10_20_30).unwrap(); + let ctx_sw = VerificationSignatureContext::from_module(&switch); assert!( - is_acyclic_modelable_body(&switch.functions[0].instructions), + is_acyclic_modelable_body(&switch.functions[0].instructions, &ctx_sw, 0), "pure-integer acyclic br_table function must be admitted" ); } + /// M1b: a caller that `call`s a br_table callee must prove equivalent to the + /// same logic with the callee inlined — i.e. the executor models `call F` + /// by-body identically to the inlined body (the inline-soundness principle + /// extended to control flow). func 0 = decide (switch), func 1 = z (calls it). + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_call_by_body_equiv_inlined() { + let caller_with_call = r#" + (module + (func (param i32) (result i32) + (block (block (block + local.get 0 + br_table 0 1 2) + i32.const 10 + return) + i32.const 20 + return) + i32.const 30 + return) + (func (param i32) (result i32) + local.get 0 + call 0))"#; + // z (func 1) calls decide (func 0); compare to the inlined switch. + assert!( + acyclic_equiv_at(caller_with_call, 1, SWITCH_10_20_30, 0).unwrap(), + "call-by-body must prove equivalent to the inlined callee body" + ); + } + + /// M1b: a call to a no-return callee (panic→unreachable) must be modeled as + /// ⊥ (diverge), never havoc. A function that returns 7 on one path and traps + /// on the other proves equivalent to one that simply returns 7 (the trapping + /// path drops out of the result; both sides agree on the returning path). + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_noreturn_call_diverges() { + // func 0 = panic (no params, unreachable); func 1 = g: if arg!=0 return 7 + // else call panic (diverge). func 1 should equal "always return 7". + let with_trap = r#" + (module + (func unreachable) + (func (param i32) (result i32) + local.get 0 + if (result i32) + i32.const 7 + else + call 0 + i32.const 7 + end))"#; + let always7 = r#" + (module (func (param i32) (result i32) i32.const 7))"#; + assert!( + acyclic_equiv_at(with_trap, 1, always7, 0).unwrap(), + "no-return call path must diverge (drop out), leaving only the return-7 path" + ); + } + #[test] fn test_verify_constant_folding() { // Original: (i32.add (i32.const 2) (i32.const 3)) From eff198f117cbcbd8073f11cce44cf1c188000be5 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 13:50:08 +0200 Subject: [PATCH 10/20] =?UTF-8?q?feat(219):=20PR-C=20M3.1=20=E2=80=94=20ve?= =?UTF-8?q?rify=5Facyclic=5Fequivalence=20precise=20fast-path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the equivalence harness over the precise acyclic CF executor: encodes both functions via encode_acyclic_function (deterministic input names unify in Z3) and checks a == b has no counterexample. Mirrors verify_function_equivalence's soundness boundaries (no width-matching the final equality; Unknown -> Err). Returns Err whenever either side is not acyclic-modelable, a width mismatches, or the encoding fails — so the caller (inline_functions' verify-or-revert, wired in M3.2) FALLS BACK to the existing encoder. This only ADDS proving power for the br_table-callee inline case; it never removes a check. Void functions defer to the fallback (no side-effect equivalence modeled yet). Still #[allow(dead_code)] until M3.2 wires it into inline_functions. Tests (9 total, green): + test_verify_acyclic_equivalence_inline — a caller that calls a br_table decide verifies equivalent to the inlined switch, a wrong-default body does NOT verify, and a loop function Errs (caller falls back). Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/verify.rs | 119 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index 47287d6..efe6efa 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7837,6 +7837,70 @@ fn encode_acyclic_function( } } +/// PR-C (#219) M3: precise FAST PATH for proving `original ≡ optimized` when +/// BOTH functions are acyclic-CF-modelable (integer, block/if/br/br_if/br_table, +/// by-body calls, no loops/memory). Encodes both via the precise acyclic +/// executor (deterministic input names → they unify in Z3) and checks semantic +/// equivalence (`a == b` has no counterexample). This is what lets the verified +/// inliner prove a `call F` (with a `br_table` callee) equal to its inlined +/// body — the case the straight-line by-body modeler (`callee_inlinable_by_body`) +/// cannot reach. +/// +/// Returns `Err` if either function is not acyclic-modelable, if a result width +/// mismatches, or if the encoding fails — the caller then FALLS BACK to the +/// existing encoder-based `verify_function_equivalence` (no behavior change for +/// non-acyclic functions; this only ADDS proving power, never removes a check). +#[cfg(feature = "verification")] +#[allow(dead_code)] // wired into inline_functions' verify-or-revert in M3 step 2 +pub(crate) fn verify_acyclic_equivalence( + original: &Function, + optimized: &Function, + ctx: &VerificationSignatureContext, +) -> Result { + if original.signature.params != optimized.signature.params + || original.signature.results != optimized.signature.results + { + return Err(anyhow!("acyclic verify: signature changed")); + } + // Only handle the precise acyclic case; anything else → caller falls back. + if !is_acyclic_modelable_body(&original.instructions, ctx, 0) + || !is_acyclic_modelable_body(&optimized.instructions, ctx, 0) + { + return Err(anyhow!("acyclic verify: not acyclic-modelable")); + } + + let cfg = z3_config_with_timeout(); + with_z3_config(&cfg, || { + let solver = Solver::new(); + let orig_result = encode_acyclic_function(original, ctx); + let opt_result = encode_acyclic_function(optimized, ctx); + match (&orig_result, &opt_result) { + (Ok(Some(orig)), Ok(Some(opt))) => { + // loom#145 soundness boundary: never width-match the final + // equality — a width mismatch is a model artifact we cannot + // soundly resolve, so bail to "not proven" and let the caller + // fall back / revert. + if orig.get_size() != opt.get_size() { + return Err(anyhow!("acyclic verify: result width mismatch")); + } + solver.assert(orig.eq(opt).not()); + match solver.check() { + SatResult::Unsat => Ok(true), + SatResult::Sat => Ok(false), + SatResult::Unknown => Err(anyhow!("acyclic verify: solver unknown (timeout)")), + } + } + // Both void: no return value to compare. The acyclic executor does + // not yet model side-effect (global/memory) equivalence for void + // functions, so defer to the existing verifier — signal fall-back. + (Ok(None), Ok(None)) => Err(anyhow!("acyclic verify: void — defer to fallback")), + (Err(e), _) => Err(anyhow!("acyclic verify: encode original: {}", e)), + (_, Err(e)) => Err(anyhow!("acyclic verify: encode optimized: {}", e)), + _ => Err(anyhow!("acyclic verify: return-type mismatch")), + } + }) +} + /// Verify that block stack properties are preserved across optimization /// /// Uses Z3 to formally verify that a block transformation preserves: @@ -8291,6 +8355,61 @@ mod tests { ); } + /// M3: the public verify_acyclic_equivalence fast-path proves a caller that + /// `call`s a br_table decide equivalent to the inlined switch, and rejects a + /// non-equivalent optimized body — the exact inline verify-or-revert pattern. + #[test] + #[cfg(feature = "verification")] + fn test_verify_acyclic_equivalence_inline() { + let module_wat = r#" + (module + (func (param i32) (result i32) + (block (block (block + local.get 0 + br_table 0 1 2) + i32.const 10 + return) + i32.const 20 + return) + i32.const 30 + return) + (func (param i32) (result i32) + local.get 0 + call 0))"#; + let m = parse::parse_wat(module_wat).unwrap(); + let ctx = VerificationSignatureContext::from_module(&m); + let z = &m.functions[1]; // caller: local.get 0; call decide + + // Equivalent: the inlined switch. + let inlined = parse::parse_wat(SWITCH_10_20_30).unwrap(); + assert!( + verify_acyclic_equivalence(z, &inlined.functions[0], &ctx).unwrap(), + "z (calls decide) must verify ≡ the inlined switch" + ); + + // Non-equivalent: a switch with a wrong default (99) must NOT verify. + let wrong = parse::parse_wat( + r#"(module (func (param i32) (result i32) + (block (block (block local.get 0 br_table 0 1 2) + i32.const 10 return) i32.const 20 return) i32.const 99 return))"#, + ) + .unwrap(); + assert!( + !verify_acyclic_equivalence(z, &wrong.functions[0], &ctx).unwrap(), + "a wrong-default body must NOT verify equivalent" + ); + + // A function with a loop is not acyclic-modelable → Err (caller falls back). + let loopy = parse::parse_wat( + "(module (func (param i32) (result i32) (loop (result i32) local.get 0)))", + ) + .unwrap(); + assert!( + verify_acyclic_equivalence(&loopy.functions[0], &loopy.functions[0], &ctx).is_err(), + "non-acyclic functions must Err so the caller falls back" + ); + } + #[test] fn test_verify_constant_folding() { // Original: (i32.add (i32.const 2) (i32.const 3)) From 8260ac8edd2ada60b3ad7ae1ece09351404894c6 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 15:12:30 +0200 Subject: [PATCH 11/20] =?UTF-8?q?feat(219):=20PR-C=20M3.2=20=E2=80=94=20wi?= =?UTF-8?q?re=20the=20precise=20acyclic=20fast-path=20into=20the=20inliner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TranslationValidator::verify() now tries verify_acyclic_equivalence for the inline pass before the existing encoder. The main encoder models br_table approximately and a br_table callee is not straight-line-by-body-modelable, so it cannot prove `call F` (br_table callee) equal to its inlined body → the inline reverts. The acyclic executor is EXACT for acyclic CF + by-body calls, so it can. PURELY ADDITIVE + scoped to pass_name == "inline_functions": ONLY a precise Ok(true) short-circuits to accept; Ok(false)/Err/panic FALL THROUGH to the existing verify_function_equivalence_with_context unchanged (acyclic call wrapped in its own catch_unwind). This can only ADD accepts (the br_table-callee inlines), never weaken an existing check; every non-inline pass is byte-identical. This also makes the M1a/M1b/M3.1 executor chain live (the #[allow(dead_code)] markers are now harmless no-ops). No br_table callee is admitted as an inline CANDIDATE yet (function_inline_modelable is still straight-line — M3.3), so this is a behavioral no-op on the current corpus: full `cargo test --release --lib` is 408 passed / 0 failed / 2 ignored, unchanged. clippy + fmt clean. The capability is proven by the 9 acyclic unit tests; M3.3 enables it end-to-end on the seam. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/verify.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index efe6efa..119f19c 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -3009,6 +3009,31 @@ impl TranslationValidator { return Ok(()); } + // PR-C (#219) M3.2: precise acyclic-CF fast-path for the inliner. The + // main encoder models br_table approximately and a br_table callee is + // not straight-line-by-body-modelable, so it cannot prove `call F` + // (br_table callee) equal to its inlined body → the inline reverts. The + // acyclic executor is EXACT for acyclic CF + by-body calls. Try it here. + // + // PURELY ADDITIVE + scoped to the inliner: ONLY a precise Ok(true) (a + // real proof) short-circuits to accept; Ok(false)/Err/panic FALL THROUGH + // to the existing encoder unchanged. So this can only ADD accepts (the + // br_table-callee inlines), never weaken an existing check, and every + // non-inline pass is byte-identical. (catch_unwind: the acyclic encoder + // builds Z3 exprs and could panic on a malformed input.) + if self.pass_name == "inline_functions" { + let original = self.original.clone(); + let sig_ctx = self.sig_ctx.clone(); + let optimized_clone = optimized.clone(); + let acyclic = std::panic::catch_unwind(std::panic::AssertUnwindSafe(move || { + verify_acyclic_equivalence(&original, &optimized_clone, &sig_ctx) + })); + if let Ok(Ok(true)) = acyclic { + return Ok(()); + } + // else: fall through to the existing encoder (no behavior change). + } + // Use catch_unwind to handle Z3 internal panics gracefully. (The // i64 inline hang in loom#147 turned out NOT to be a verify hang — // verify returns fast; it was a livelock in the inline-pass From 2493ad588784ea3151e5f04f0bb5554a81ca0c50 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 16:48:33 +0200 Subject: [PATCH 12/20] =?UTF-8?q?feat(219):=20PR-C=20M3.3=20=E2=80=94=20ad?= =?UTF-8?q?mit=20acyclic-CF=20callees=20as=20inline=20candidates=20+=20Unr?= =?UTF-8?q?eachable=20trap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit function_inline_modelable now admits a callee under EITHER regime A (straight-line incl. memory — the existing by-body modeler, #155/#157/#161) OR regime B (acyclic control flow + pure-integer ops + by-body calls to no-return/recursively-modelable local callees — the precise acyclic executor, PR-C). Mirrors verify::is_acyclic_modelable_body so the candidate gate admits exactly what the acyclic verifier can prove (admitting an unprovable callee → caller reverts, the #159 collateral problem). Threads module context (functions + num_imported_funcs) into the gate + a recursion-depth bound; a br_table+memory callee passes neither regime → stays opaque until Phase 2. Also handles Unreachable as a trap (⊥) in exec_acyclic and both gates: the path diverges (reachable=false, drops out of the result ITE), never havoc — the seam decides end their overflow arms in `call panic_const; unreachable`, so this is required to model them at all. Full `cargo test --release --lib` = 409 passed / 0 failed / 2 ignored (+1: test_acyclic_unreachable_diverges). fmt + clippy clean. SCOPE NOTE (found by inspecting the real repro-219/sem.loom.wasm): decide (gale_k_sem_give_decide) IS regime-B-eligible (acyclic br_table + i64 pack + no-return panic call, no memory), so it is now an inline candidate. BUT its caller z_impl_k_sem_give READS MEMORY (i32.load offset=8/12 for decide's args) and calls imports, so proving decide's inline INTO z_impl needs the acyclic executor to model z_impl's memory = PHASE 2. So sem_give's seam will not dissolve until Phase 2 (memory Array threaded through exec_acyclic). M3.1-M3.3 are correct + necessary infra; Phase 2 is the next real enabler for sem_give. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 352 +++++++++++++++++++++++++++++----------- loom-core/src/verify.rs | 31 +++- 2 files changed, 290 insertions(+), 93 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 2a25445..650d21d 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -13447,7 +13447,7 @@ pub mod optimize { .functions .get((*func_idx - num_imported_funcs) as usize) { - if !function_inline_modelable(callee) { + if !function_inline_modelable(callee, &module.functions, num_imported_funcs) { continue; } } @@ -13551,100 +13551,268 @@ pub mod optimize { Ok(()) } - /// loom#159: is this callee FULLY by-body-inline-modelable — i.e. will the - /// translation validator be able to PROVE its inline? - /// - /// A callee is only a sound inline candidate if the verifier can model its - /// body by-body (loom#151/#155/#157). If it contains ANY op the verifier - /// can't model — partial-width loads/stores (`*.load8/16`, `*.store8/16/32`, - /// e.g. `filter_step`'s `i32.load16_s`), floats, globals, `memory.*` bulk - /// ops, calls (non-leaf), or control flow — then inlining it is unprovable: - /// the original models the call as a havoc while the optimized has concrete - /// effects, so a downstream read diverges → FALSE counterexample → the whole - /// caller reverts, dragging down any *other* inline in it too (the v1.1.7 - /// regression: a partial-width `filter_step` killed the `controller_step` - /// reader-inline). Excluding such callees keeps them opaque (havoc) calls, - /// which is exactly what a following modelable reader's inline needs. - /// - /// This MUST stay in lock-step with `verify::is_inline_modelable_instr` - /// (the verifier's by-body whitelist). Straight-line only: any control flow - /// or call disqualifies the callee. - fn function_inline_modelable(func: &super::Function) -> bool { - func.instructions.iter().all(|i| { - matches!( - i, - Instruction::I32Const(_) - | Instruction::I64Const(_) - | Instruction::LocalGet(_) - | Instruction::LocalSet(_) - | Instruction::LocalTee(_) - | Instruction::Drop - | Instruction::Select - | Instruction::I32Load { .. } - | Instruction::I64Load { .. } - | Instruction::I32Store { .. } - | Instruction::I64Store { .. } - | Instruction::I32Load8S { .. } - | Instruction::I32Load8U { .. } - | Instruction::I32Load16S { .. } - | Instruction::I32Load16U { .. } - | Instruction::I32Store8 { .. } - | Instruction::I32Store16 { .. } - | Instruction::I32Add - | Instruction::I32Sub - | Instruction::I32Mul - | Instruction::I32And - | Instruction::I32Or - | Instruction::I32Xor - | Instruction::I32Shl - | Instruction::I32ShrS - | Instruction::I32ShrU - | Instruction::I64Add - | Instruction::I64Sub - | Instruction::I64Mul - | Instruction::I64And - | Instruction::I64Or - | Instruction::I64Xor - | Instruction::I64Shl - | Instruction::I64ShrS - | Instruction::I64ShrU - | Instruction::I32DivS - | Instruction::I32DivU - | Instruction::I32RemS - | Instruction::I32RemU - | Instruction::I64DivS - | Instruction::I64DivU - | Instruction::I64RemS - | Instruction::I64RemU - | Instruction::I32Eq - | Instruction::I32Ne - | Instruction::I32LtS - | Instruction::I32LtU - | Instruction::I32GtS - | Instruction::I32GtU - | Instruction::I32LeS - | Instruction::I32LeU - | Instruction::I32GeS - | Instruction::I32GeU - | Instruction::I32Eqz - | Instruction::I64Eq - | Instruction::I64Ne - | Instruction::I64LtS - | Instruction::I64LtU - | Instruction::I64GtS - | Instruction::I64GtU - | Instruction::I64LeS - | Instruction::I64LeU - | Instruction::I64GeS - | Instruction::I64GeU - | Instruction::I64Eqz - | Instruction::I64ExtendI32S - | Instruction::I64ExtendI32U - | Instruction::I32WrapI64 - ) + /// REGIME A whitelist: per-instruction straight-line ops the by-body + /// modeler (`verify::encode_inlinable_callee_result`) proves faithfully, + /// INCLUDING full/partial-width memory loads/stores (loom#151/#155/#157/#161). + /// No control flow, no calls. MUST stay in lock-step with + /// `verify::is_inline_modelable_instr`. Used by `function_inline_modelable`. + fn is_straightline_modelable_instr(i: &Instruction) -> bool { + matches!( + i, + Instruction::I32Const(_) + | Instruction::I64Const(_) + | Instruction::LocalGet(_) + | Instruction::LocalSet(_) + | Instruction::LocalTee(_) + | Instruction::Drop + | Instruction::Select + | Instruction::I32Load { .. } + | Instruction::I64Load { .. } + | Instruction::I32Store { .. } + | Instruction::I64Store { .. } + | Instruction::I32Load8S { .. } + | Instruction::I32Load8U { .. } + | Instruction::I32Load16S { .. } + | Instruction::I32Load16U { .. } + | Instruction::I32Store8 { .. } + | Instruction::I32Store16 { .. } + | Instruction::I32Add + | Instruction::I32Sub + | Instruction::I32Mul + | Instruction::I32And + | Instruction::I32Or + | Instruction::I32Xor + | Instruction::I32Shl + | Instruction::I32ShrS + | Instruction::I32ShrU + | Instruction::I64Add + | Instruction::I64Sub + | Instruction::I64Mul + | Instruction::I64And + | Instruction::I64Or + | Instruction::I64Xor + | Instruction::I64Shl + | Instruction::I64ShrS + | Instruction::I64ShrU + | Instruction::I32DivS + | Instruction::I32DivU + | Instruction::I32RemS + | Instruction::I32RemU + | Instruction::I64DivS + | Instruction::I64DivU + | Instruction::I64RemS + | Instruction::I64RemU + | Instruction::I32Eq + | Instruction::I32Ne + | Instruction::I32LtS + | Instruction::I32LtU + | Instruction::I32GtS + | Instruction::I32GtU + | Instruction::I32LeS + | Instruction::I32LeU + | Instruction::I32GeS + | Instruction::I32GeU + | Instruction::I32Eqz + | Instruction::I64Eq + | Instruction::I64Ne + | Instruction::I64LtS + | Instruction::I64LtU + | Instruction::I64GtS + | Instruction::I64GtU + | Instruction::I64LeS + | Instruction::I64LeU + | Instruction::I64GeS + | Instruction::I64GeU + | Instruction::I64Eqz + | Instruction::I64ExtendI32S + | Instruction::I64ExtendI32U + | Instruction::I32WrapI64 + ) + } + + /// REGIME B per-instruction whitelist: pure-integer ops the precise acyclic + /// executor (`verify::exec_acyclic`) models — NO memory (Phase 1), NO + /// floats/unknown. MUST stay in lock-step with + /// `verify::is_acyclic_simple_modelable`. + fn is_acyclic_int_modelable_instr(i: &Instruction) -> bool { + matches!( + i, + Instruction::I32Const(_) + | Instruction::I64Const(_) + | Instruction::LocalGet(_) + | Instruction::LocalSet(_) + | Instruction::LocalTee(_) + | Instruction::GlobalGet(_) + | Instruction::GlobalSet(_) + | Instruction::Drop + | Instruction::Nop + | Instruction::Select + | Instruction::I32Add + | Instruction::I32Sub + | Instruction::I32Mul + | Instruction::I32And + | Instruction::I32Or + | Instruction::I32Xor + | Instruction::I32Shl + | Instruction::I32ShrS + | Instruction::I32ShrU + | Instruction::I32DivS + | Instruction::I32DivU + | Instruction::I32RemS + | Instruction::I32RemU + | Instruction::I64Add + | Instruction::I64Sub + | Instruction::I64Mul + | Instruction::I64And + | Instruction::I64Or + | Instruction::I64Xor + | Instruction::I64Shl + | Instruction::I64ShrS + | Instruction::I64ShrU + | Instruction::I64DivS + | Instruction::I64DivU + | Instruction::I64RemS + | Instruction::I64RemU + | Instruction::I32Eq + | Instruction::I32Ne + | Instruction::I32LtS + | Instruction::I32LtU + | Instruction::I32GtS + | Instruction::I32GtU + | Instruction::I32LeS + | Instruction::I32LeU + | Instruction::I32GeS + | Instruction::I32GeU + | Instruction::I32Eqz + | Instruction::I64Eq + | Instruction::I64Ne + | Instruction::I64LtS + | Instruction::I64LtU + | Instruction::I64GtS + | Instruction::I64GtU + | Instruction::I64LeS + | Instruction::I64LeU + | Instruction::I64GeS + | Instruction::I64GeU + | Instruction::I64Eqz + | Instruction::I64ExtendI32S + | Instruction::I64ExtendI32U + | Instruction::I32WrapI64 + ) + } + + /// loom-side mirror of `verify::is_noreturn_callee` (that one is + /// `#[cfg(feature = "verification")]`-gated, unavailable here). True if every + /// path traps: no `Return`/`Br*` anywhere (recursing into nested CF) and the + /// body ends in `Unreachable`. Admits exactly the `panic*` helpers. + fn inline_callee_is_noreturn(func: &super::Function) -> bool { + fn has_branch_or_return(instrs: &[Instruction]) -> bool { + instrs.iter().any(|i| match i { + Instruction::Return + | Instruction::Br(_) + | Instruction::BrIf(_) + | Instruction::BrTable { .. } => true, + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + has_branch_or_return(body) + } + Instruction::If { + then_body, + else_body, + .. + } => has_branch_or_return(then_body) || has_branch_or_return(else_body), + _ => false, + }) + } + !has_branch_or_return(&func.instructions) + && matches!(func.instructions.last(), Some(Instruction::Unreachable)) + } + + /// Max by-body call-recursion depth for the regime-B check (mirrors + /// `verify::MAX_ACYCLIC_CALL_DEPTH`); bounds cyclic call graphs. + const INLINE_ACYCLIC_MAX_DEPTH: usize = 8; + + /// REGIME B: acyclic control flow (block/if/br/br_if/br_table) + pure-integer + /// ops + by-body calls (to no-return or recursively-modelable LOCAL callees). + /// NO memory (Phase 1). Mirrors `verify::is_acyclic_modelable_body` so the + /// inline candidate gate admits exactly what the acyclic verifier can prove. + fn inline_modelable_acyclic_body( + body: &[Instruction], + all_functions: &[super::Function], + num_imported_funcs: u32, + depth: usize, + ) -> bool { + body.iter().all(|i| match i { + Instruction::Block { body, .. } => { + inline_modelable_acyclic_body(body, all_functions, num_imported_funcs, depth) + } + Instruction::If { + then_body, + else_body, + .. + } => { + inline_modelable_acyclic_body(then_body, all_functions, num_imported_funcs, depth) + && inline_modelable_acyclic_body( + else_body, + all_functions, + num_imported_funcs, + depth, + ) + } + Instruction::Br(_) + | Instruction::BrIf(_) + | Instruction::BrTable { .. } + | Instruction::Return + // Unreachable is a trap (⊥): the acyclic executor diverges the path. + | Instruction::Unreachable => true, + // Back-edges and indirect calls are out of Phase 1 scope. + Instruction::Loop { .. } | Instruction::CallIndirect { .. } => false, + Instruction::Call(g) => { + if depth >= INLINE_ACYCLIC_MAX_DEPTH || *g < num_imported_funcs { + return false; // too deep, or an import (opaque) + } + match all_functions.get((*g - num_imported_funcs) as usize) { + Some(callee) => { + inline_callee_is_noreturn(callee) + || (callee.signature.results.len() <= 1 + && inline_modelable_acyclic_body( + &callee.instructions, + all_functions, + num_imported_funcs, + depth + 1, + )) + } + None => false, + } + } + other => is_acyclic_int_modelable_instr(other), }) } + /// loom#159: is this callee FULLY inline-modelable — i.e. will the + /// translation validator be able to PROVE its inline? Admits EITHER regime A + /// (straight-line incl. memory — by-body modeler) OR regime B (acyclic CF + + /// pure-int + by-body calls — the precise acyclic executor, PR-C #219). A + /// callee outside both stays an opaque call: admitting an unprovable one + /// makes the original (havoc) and optimized (concrete) encodings diverge → + /// FALSE counterexample → the whole caller reverts, dragging down any OTHER + /// inline in it (the v1.1.7 #159 regression). A br_table+memory callee + /// (mutex_unlock, pipe) passes NEITHER regime → stays opaque until Phase 2. + fn function_inline_modelable( + func: &super::Function, + all_functions: &[super::Function], + num_imported_funcs: u32, + ) -> bool { + func.instructions + .iter() + .all(is_straightline_modelable_instr) + || inline_modelable_acyclic_body( + &func.instructions, + all_functions, + num_imported_funcs, + 0, + ) + } + /// Inline function calls in a block of instructions fn inline_calls_in_block( instructions: &[Instruction], diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index 119f19c..eb0703f 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7460,7 +7460,9 @@ fn is_acyclic_modelable_body( Instruction::Br(_) | Instruction::BrIf(_) | Instruction::BrTable { .. } - | Instruction::Return => true, + | Instruction::Return + // Unreachable is a trap (⊥): the executor diverges the path, never havoc. + | Instruction::Unreachable => true, other => is_acyclic_simple_modelable(other), }) } @@ -7684,6 +7686,12 @@ fn exec_acyclic( acyclic_push_branch(labels, 0, state.path.clone(), state)?; state.reachable = false; } + Instruction::Unreachable => { + // Trap: this path diverges (⊥) — it drops out of the result + // ITE and contributes to no join, exactly like a no-return + // call. NEVER havoc (Alive2 ub discipline). + state.reachable = false; + } Instruction::Call(g) => { let callee = ctx .function_bodies @@ -8380,6 +8388,27 @@ mod tests { ); } + /// A direct `unreachable` (trap) on one path must diverge — same ⊥ discipline + /// as a no-return call. The seam decides end their overflow arms in + /// `unreachable`, so the executor must model it (not reject the function). + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_unreachable_diverges() { + let with_trap = r#" + (module (func (param i32) (result i32) + local.get 0 + if (result i32) + i32.const 7 + else + unreachable + end))"#; + let always7 = r#"(module (func (param i32) (result i32) i32.const 7))"#; + assert!( + acyclic_equiv(with_trap, always7).unwrap(), + "the unreachable (trap) path must diverge, leaving only the return-7 path" + ); + } + /// M3: the public verify_acyclic_equivalence fast-path proves a caller that /// `call`s a br_table decide equivalent to the inlined switch, and rejects a /// non-equivalent optimized body — the exact inline verify-or-revert pattern. From dc3222bd739191a4ec9a3503a59e68fee0378865 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 19:22:20 +0200 Subject: [PATCH 13/20] =?UTF-8?q?feat(219):=20PR-C=20Phase=202a=20?= =?UTF-8?q?=E2=80=94=20thread=20memory=20Array=20through=20the=20acyclic?= =?UTF-8?q?=20executor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The acyclic executor now models linear memory (loads/stores), required because real seam callers (z_impl_k_sem_give) read memory for the decide's args. AcyclicState gains `memory: Array`. Memory is MUTABLE branch state, so it is ITE-merged at every join exactly like locals/globals: BranchEntry/MergedState carry the Array, acyclic_merge_entries ITE-merges it (Bool::ite over the Array sort), acyclic_push_branch records it, acyclic_apply_join sets it, and the If arm saves/restores it for the else branch. exec_acyclic handles I32/I64 load/store + partial-width load8/16/store8/16 EXPLICITLY via the shared encode_i32_load_from / encode_i64_load_from / encode_*_store_into / encode_partial_* helpers (the same ones the main encoder + by-body modeler use -> bit-identical memory expressions); they are NEVER delegated to encode_simple_instruction (which havocs). memory is threaded through encode_acyclic_call (shared + propagated like globals) and initialized in encode_acyclic_function as Array::new_const("memory", ...) -- same name as the main encoder so both encodings unify. Gates: new is_acyclic_memory_op (verify.rs) admitted by is_acyclic_modelable_body; lib-side is_acyclic_memory_modelable_instr admitted by the regime-B inline gate. Call gate stays STRICT (no impure-call havoc yet -- that is Phase 2b, the soundness-critical part). So sem_give's z_impl (which also calls k_spin_lock import + z_unpend) is not yet fully modelable; 2a is the contained, testable foundation. Full `cargo test --release --lib` = 410 passed / 0 failed / 2 ignored. New test test_acyclic_memory_merge: store-on-each-branch then load proves equivalent to the direct if/else (validates the per-join memory ITE-merge feeds the loaded value). test_acyclic_modelable_gate updated (memory now admitted). fmt + clippy clean. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 23 ++++- loom-core/src/verify.rs | 218 +++++++++++++++++++++++++++++++++++----- 2 files changed, 213 insertions(+), 28 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 650d21d..6667327 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -13701,6 +13701,25 @@ pub mod optimize { ) } + /// PR-C Phase 2: memory loads/stores the precise acyclic executor models + /// (mirrors `verify::is_acyclic_memory_op`). Admitted by the regime-B inline + /// gate so memory-bearing acyclic callees are inline candidates. + fn is_acyclic_memory_modelable_instr(i: &Instruction) -> bool { + matches!( + i, + Instruction::I32Load { .. } + | Instruction::I64Load { .. } + | Instruction::I32Store { .. } + | Instruction::I64Store { .. } + | Instruction::I32Load8S { .. } + | Instruction::I32Load8U { .. } + | Instruction::I32Load16S { .. } + | Instruction::I32Load16U { .. } + | Instruction::I32Store8 { .. } + | Instruction::I32Store16 { .. } + ) + } + /// loom-side mirror of `verify::is_noreturn_callee` (that one is /// `#[cfg(feature = "verification")]`-gated, unavailable here). True if every /// path traps: no `Return`/`Br*` anywhere (recursing into nested CF) and the @@ -13784,7 +13803,9 @@ pub mod optimize { None => false, } } - other => is_acyclic_int_modelable_instr(other), + other => { + is_acyclic_int_modelable_instr(other) || is_acyclic_memory_modelable_instr(other) + } }) } diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index eb0703f..9b7ba3f 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7274,6 +7274,12 @@ struct AcyclicState { stack: Vec, locals: Vec, globals: Vec, + /// Linear memory as a Z3 `Array[BV32 → BV8]` (PR-C Phase 2). Loads/stores use + /// the SAME shared helpers (`encode_i32_load_from` etc.) the main encoder + /// uses, so a `call F` modeled by-body and the inlined body produce + /// bit-identical memory expressions. Memory is MUTABLE branch state, so it is + /// ITE-merged at every join exactly like locals/globals. + memory: Array, /// Path condition guarding the current straight-line path. path: Bool, /// False once this path has branched / returned (subsequent straight-line @@ -7281,19 +7287,15 @@ struct AcyclicState { reachable: bool, } -/// One control label. Every branch targeting it accumulates an entry; at the -/// label's join point the entries ITE-merge into a single successor state. -/// `entries`: `(guard, result_values, locals_snapshot, globals_snapshot)`. -/// -/// One incoming branch to a label: its path guard plus the result/locals/globals -/// snapshot at the branch point. +/// One incoming branch to a label: its path guard plus the result/locals/globals/ +/// memory snapshot at the branch point. #[cfg(feature = "verification")] -type BranchEntry = (Bool, Vec, Vec, Vec); +type BranchEntry = (Bool, Vec, Vec, Vec, Array); -/// The ITE-merged successor of a join: result values, locals, globals, and the -/// merged path condition. +/// The ITE-merged successor of a join: result values, locals, globals, memory, +/// and the merged path condition. #[cfg(feature = "verification")] -type MergedState = (Vec, Vec, Vec, Bool); +type MergedState = (Vec, Vec, Vec, Array, Bool); #[cfg(feature = "verification")] #[allow(dead_code)] @@ -7397,6 +7399,28 @@ fn is_acyclic_simple_modelable(instr: &Instruction) -> bool { ) } +/// PR-C Phase 2: memory loads/stores `exec_acyclic` models explicitly (via the +/// shared `encode_*_load_from`/`*_store_into` helpers). Kept SEPARATE from +/// `is_acyclic_simple_modelable` because memory ops must NOT be delegated to +/// `encode_simple_instruction` (it havocs them, desyncing the stack). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn is_acyclic_memory_op(instr: &Instruction) -> bool { + matches!( + instr, + Instruction::I32Load { .. } + | Instruction::I64Load { .. } + | Instruction::I32Store { .. } + | Instruction::I64Store { .. } + | Instruction::I32Load8S { .. } + | Instruction::I32Load8U { .. } + | Instruction::I32Load16S { .. } + | Instruction::I32Load16U { .. } + | Instruction::I32Store8 { .. } + | Instruction::I32Store16 { .. } + ) +} + /// Max by-body call-recursion depth the acyclic executor will model. The seam /// decides call only `panic*` (no-return → diverge, no further recursion), so /// depth 2 suffices; 8 gives headroom and bounds any cyclic/recursive call @@ -7463,7 +7487,7 @@ fn is_acyclic_modelable_body( | Instruction::Return // Unreachable is a trap (⊥): the executor diverges the path, never havoc. | Instruction::Unreachable => true, - other => is_acyclic_simple_modelable(other), + other => is_acyclic_simple_modelable(other) || is_acyclic_memory_op(other), }) } @@ -7497,9 +7521,13 @@ fn acyclic_push_branch( return Err(anyhow!("acyclic executor: stack underflow at branch")); } let results: Vec = state.stack[state.stack.len() - arity..].to_vec(); - labels[target] - .entries - .push((guard, results, state.locals.clone(), state.globals.clone())); + labels[target].entries.push(( + guard, + results, + state.locals.clone(), + state.globals.clone(), + state.memory.clone(), + )); Ok(()) } @@ -7511,13 +7539,14 @@ fn acyclic_push_branch( #[cfg(feature = "verification")] #[allow(dead_code)] fn acyclic_merge_entries(entries: &[BranchEntry]) -> Option { - let (_, last_res, last_loc, last_glob) = entries.last()?; + let (_, last_res, last_loc, last_glob, last_mem) = entries.last()?; let mut res = last_res.clone(); let mut loc = last_loc.clone(); let mut glob = last_glob.clone(); + let mut mem = last_mem.clone(); let mut guards: Vec = vec![entries.last().unwrap().0.clone()]; // Fold earlier entries outward: each takes precedence under its own guard. - for (g, r, l, gl) in entries[..entries.len() - 1].iter().rev() { + for (g, r, l, gl, m) in entries[..entries.len() - 1].iter().rev() { res = r .iter() .zip(res.iter()) @@ -7533,10 +7562,12 @@ fn acyclic_merge_entries(entries: &[BranchEntry]) -> Option { .zip(glob.iter()) .map(|(t, f)| merge_bv(g, t, f)) .collect(); + // ITE-merge the memory Array (Bool::ite is generic over the Z3 sort). + mem = g.ite(m, &mem); guards.push(g.clone()); } let path = Bool::or(&guards); - Some((res, loc, glob, path)) + Some((res, loc, glob, mem, path)) } /// Apply a popped label's join to the state: the post-block state is the @@ -7555,12 +7586,13 @@ fn acyclic_apply_join( state.reachable = false; Ok(()) } - Some((results, locals, globals, path)) => { + Some((results, locals, globals, memory, path)) => { let mut stack = base_stack; stack.extend(results); state.stack = stack; state.locals = locals; state.globals = globals; + state.memory = memory; state.path = path; state.reachable = true; Ok(()) @@ -7616,6 +7648,7 @@ fn exec_acyclic( let base_stack = state.stack.clone(); let saved_locals = state.locals.clone(); let saved_globals = state.globals.clone(); + let saved_memory = state.memory.clone(); let saved_path = state.path.clone(); labels.push(LabelJoin { arity, @@ -7633,6 +7666,7 @@ fn exec_acyclic( state.stack = base_stack.clone(); state.locals = saved_locals; state.globals = saved_globals; + state.memory = saved_memory; state.path = Bool::and(&[saved_path, cond_nz.not()]); state.reachable = true; exec_acyclic(else_body, state, labels, ctx, depth)?; @@ -7709,6 +7743,91 @@ fn exec_acyclic( encode_acyclic_call(callee, state, ctx, depth)?; } } + // PR-C Phase 2: memory loads/stores via the SAME shared helpers the + // main encoder uses (so a by-body call and its inlined body produce + // bit-identical memory expressions). Handled explicitly — NEVER + // delegated to encode_simple_instruction, which havocs memory ops. + Instruction::I32Load { offset, .. } => { + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I32Load underflow"))?; + state + .stack + .push(encode_i32_load_from(&state.memory, &addr, *offset)?); + } + Instruction::I64Load { offset, .. } => { + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I64Load underflow"))?; + state + .stack + .push(encode_i64_load_from(&state.memory, &addr, *offset)?); + } + Instruction::I32Store { offset, .. } => { + let value = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I32Store value underflow"))?; + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I32Store addr underflow"))?; + state.memory = encode_i32_store_into(&state.memory, &addr, &value, *offset); + } + Instruction::I64Store { offset, .. } => { + let value = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I64Store value underflow"))?; + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: I64Store addr underflow"))?; + state.memory = encode_i64_store_into(&state.memory, &addr, &value, *offset); + } + // Partial-width (8/16-bit) loads/stores (loom#161 helpers). + Instruction::I32Load8S { offset, .. } + | Instruction::I32Load8U { offset, .. } + | Instruction::I32Load16S { offset, .. } + | Instruction::I32Load16U { offset, .. } => { + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: partial load underflow"))?; + let (bytes, signed) = match instr { + Instruction::I32Load8S { .. } => (1, true), + Instruction::I32Load8U { .. } => (1, false), + Instruction::I32Load16S { .. } => (2, true), + _ => (2, false), + }; + state.stack.push(encode_partial_load_from( + &state.memory, + &addr, + *offset, + bytes, + signed, + 32, + )?); + } + Instruction::I32Store8 { offset, .. } | Instruction::I32Store16 { offset, .. } => { + let value = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: partial store value underflow"))?; + let addr = state + .stack + .pop() + .ok_or_else(|| anyhow!("acyclic executor: partial store addr underflow"))?; + let bytes = if matches!(instr, Instruction::I32Store8 { .. }) { + 1 + } else { + 2 + }; + state.memory = + encode_partial_store_into(&state.memory, &addr, &value, *offset, bytes); + } other => { // Defense in depth: never delegate an unmodeled op (the // delegate's catch-all havocs and would desync the stack). @@ -7767,9 +7886,11 @@ fn encode_acyclic_call( let mut sub = AcyclicState { stack: Vec::new(), locals: callee_locals, - // SHARED globals — the callee may read/write module globals; the merged - // result threads the updated globals back to the caller. + // SHARED globals + memory — the callee may read/write module globals and + // linear memory; the merged result threads the updated state back to the + // caller. globals: state.globals.clone(), + memory: state.memory.clone(), // Inherit the caller's path so callee branch guards conjoin it. path: state.path.clone(), reachable: true, @@ -7790,10 +7911,11 @@ fn encode_acyclic_call( } let join = sub_labels.pop().unwrap(); match acyclic_merge_entries(&join.entries) { - Some((results, _locals, globals, path)) => { - // Propagate callee global writes; narrow the caller path to the - // callee's returning paths. + Some((results, _locals, globals, memory, path)) => { + // Propagate callee global + memory writes; narrow the caller path to + // the callee's returning paths. state.globals = globals; + state.memory = memory; state.path = path; if arity == 1 { state.stack.push(results[0].clone()); @@ -7842,11 +7964,15 @@ fn encode_acyclic_function( for i in 0..16 { globals.push(BV::new_const(format!("global{}", i), 32)); } + // Symbolic initial memory — same name as the main encoder so two + // encode_acyclic_function calls (original + optimized) unify in Z3. + let memory = Array::new_const("memory", &Sort::bitvector(32), &Sort::bitvector(8)); let mut state = AcyclicState { stack: Vec::new(), locals, globals, + memory, path: Bool::from_bool(true), reachable: true, }; @@ -7865,7 +7991,7 @@ fn encode_acyclic_function( return Ok(None); } match acyclic_merge_entries(&func_join.entries) { - Some((results, _, _, _)) => Ok(Some(results[0].clone())), + Some((results, _, _, _, _)) => Ok(Some(results[0].clone())), None => Err(anyhow!("acyclic executor: function has no reachable exit")), } } @@ -8302,7 +8428,7 @@ mod tests { #[test] #[cfg(feature = "verification")] fn test_acyclic_modelable_gate() { - // The whole-function gate must reject loops and memory ops. + // The whole-function gate must reject loops (back-edges). let with_loop = parse::parse_wat("(module (func (param i32) (result i32) (loop local.get 0)))") .unwrap(); @@ -8312,14 +8438,16 @@ mod tests { "loops (back-edges) must be rejected" ); + // PR-C Phase 2: memory ops are now ADMITTED (the executor threads a + // memory Array). (Phase 1 rejected them.) let with_memory = parse::parse_wat( "(module (memory 1) (func (param i32) (result i32) local.get 0 i32.load))", ) .unwrap(); let ctx_mem = VerificationSignatureContext::from_module(&with_memory); assert!( - !is_acyclic_modelable_body(&with_memory.functions[0].instructions, &ctx_mem, 0), - "memory ops must be rejected in Phase 1" + is_acyclic_modelable_body(&with_memory.functions[0].instructions, &ctx_mem, 0), + "memory ops are modelable in Phase 2" ); // A pure-integer br_table switch must be admitted. @@ -8409,6 +8537,42 @@ mod tests { ); } + /// PR-C Phase 2: a store on each branch followed by a load must MERGE memory + /// correctly (the load sees ite(cond, mem_then, mem_else)). Equivalent to the + /// direct if/else — proves the per-join memory ITE-merge is wired right. + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_memory_merge() { + let with_mem = r#" + (module (memory 1) + (func (param i32) (result i32) + local.get 0 + if + i32.const 100 + i32.const 5 + i32.store + else + i32.const 100 + i32.const 6 + i32.store + end + i32.const 100 + i32.load))"#; + let direct = r#" + (module (memory 1) + (func (param i32) (result i32) + local.get 0 + if (result i32) + i32.const 5 + else + i32.const 6 + end))"#; + assert!( + acyclic_equiv(with_mem, direct).unwrap(), + "store-on-each-branch then load must merge memory (ite) → loaded value matches" + ); + } + /// M3: the public verify_acyclic_equivalence fast-path proves a caller that /// `call`s a br_table decide equivalent to the inlined switch, and rejects a /// non-equivalent optimized body — the exact inline verify-or-revert pattern. From 01f3042ab7a4a4b9604c2e687b1cad67f8ba2971 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 17 Jun 2026 20:54:55 +0200 Subject: [PATCH 14/20] =?UTF-8?q?feat(219):=20PR-C=20Phase=202b=20?= =?UTF-8?q?=E2=80=94=20impure-call=20havoc=20in=20the=20acyclic=20executor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The acyclic executor now models impure calls (imports + non-acyclic-modelable local callees) by HAVOC, so memory-bearing seam callers like z_impl_k_sem_give (which call k_spin_lock + z_unpend) become verifier-modelable end-to-end. exec_acyclic's Call arm is now 3-tier (mirrors the main encoder): no-return -> diverge; acyclic single-result local -> by-body (encode_acyclic_call); else (import / multi-result / too-deep / loop-containing) -> acyclic_havoc_call: pop args, push fresh `call_{g}_result_{i}`, havoc all globals + memory with a SHARED per-encode counter. The counter is threaded (&mut u64) through exec_acyclic + encode_acyclic_call and reset per encode_acyclic_function -- so the Nth impure call gets the SAME havoc id in the original and optimized encodings -> they unify in Z3. Havoc is conservative-sound: it never forges a value equal to a concrete one, so a havoc-vs-concrete mismatch refuses to prove (the #155/#159 guard). is_acyclic_modelable_body is now purely STRUCTURAL (drops ctx/depth params -- any direct call is modelable since havoc covers the opaque case; clippy only_used_in_recursion). The lib candidate gate is unchanged (decide already admitted via its no-return panic call). Full `cargo test --release --lib` = 413 passed / 0 failed / 2 ignored. New tests: test_acyclic_havoc_call_unifies (havoc id parity); test_acyclic_havoc_not_concrete (havoc'd result NOT provably equal to a concrete value -- soundness); test_acyclic_havoc_memory (load after impure call reads havoc'd memory, not the initial -- soundness). fmt + clippy clean. Next: e2e dissolve on repro-219/sem.loom.wasm (decide's inline into z_impl should now be PROVEN by the acyclic path -> no `call $gale_k_sem_give_decide`). Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/verify.rs | 255 ++++++++++++++++++++++++++++++---------- 1 file changed, 192 insertions(+), 63 deletions(-) diff --git a/loom-core/src/verify.rs b/loom-core/src/verify.rs index 9b7ba3f..4548eb6 100644 --- a/loom-core/src/verify.rs +++ b/loom-core/src/verify.rs @@ -7429,48 +7429,28 @@ fn is_acyclic_memory_op(instr: &Instruction) -> bool { const MAX_ACYCLIC_CALL_DEPTH: usize = 8; /// Whole-function gate: true iff every instruction (recursing into block/if -/// bodies AND into by-body call targets) is a modeled control op, an allowlisted -/// simple op, or an admissible `Call`. Back-edges (Loop), call_indirect, memory, -/// unknown opcodes, unmodeled block shapes, imports, and over-deep/cyclic call -/// graphs all fail. A `Call(g)` is admissible iff `g` is a local function that -/// is itself no-return (diverges → modeled as ⊥) or recursively acyclic-modelable. +/// bodies) is structurally modelable by `exec_acyclic`. Back-edges (Loop), +/// call_indirect, unknown opcodes, and unmodeled block shapes (params/multi- +/// result) fail. As of Phase 2b the gate is purely STRUCTURAL — it needs no +/// module context, because ANY direct call is modelable (the executor diverges +/// no-return callees, models acyclic locals by-body, and HAVOCS the rest — +/// havoc is conservative-sound), and memory ops are handled explicitly. /// -/// This is a fast pre-check for the M3 routing decision; `exec_acyclic` is the -/// actual soundness enforcer (it returns `Err` on anything it cannot model -/// precisely, never havoc). +/// A fast pre-check for the M3 routing decision; `exec_acyclic` is the actual +/// soundness enforcer (it returns `Err` on anything it cannot model, never havoc +/// a value-producing op silently). #[cfg(feature = "verification")] #[allow(dead_code)] -fn is_acyclic_modelable_body( - body: &[Instruction], - ctx: &VerificationSignatureContext, - depth: usize, -) -> bool { +fn is_acyclic_modelable_body(body: &[Instruction]) -> bool { body.iter().all(|instr| match instr { - // Back-edges and indirect calls are out of Phase 1 scope. + // Back-edges and indirect calls are out of scope. Instruction::Loop { .. } | Instruction::CallIndirect { .. } => false, - Instruction::Call(g) => { - if depth >= MAX_ACYCLIC_CALL_DEPTH { - return false; - } - match ctx - .function_bodies - .get(*g as usize) - .and_then(|o| o.as_deref()) - { - // No-return callee (panic→unreachable): modeled as ⊥ diverge. - Some(callee) if is_noreturn_callee(callee) => true, - // Otherwise the callee must itself be acyclic-modelable, and - // single-result (the executor pushes at most one result value). - Some(callee) => { - callee.signature.results.len() <= 1 - && is_acyclic_modelable_body(&callee.instructions, ctx, depth + 1) - } - // Import (None) or out-of-range: opaque, not modelable. - None => false, - } - } + // ANY direct call is modelable: by-body (acyclic local), ⊥-diverge + // (no-return), or HAVOC (import / loop-containing / multi-result). The + // executor's Call arm makes the choice; havoc never forges a false ≡. + Instruction::Call(_) => true, Instruction::Block { block_type, body } => { - acyclic_block_arity(block_type).is_ok() && is_acyclic_modelable_body(body, ctx, depth) + acyclic_block_arity(block_type).is_ok() && is_acyclic_modelable_body(body) } Instruction::If { block_type, @@ -7478,8 +7458,8 @@ fn is_acyclic_modelable_body( else_body, } => { acyclic_block_arity(block_type).is_ok() - && is_acyclic_modelable_body(then_body, ctx, depth) - && is_acyclic_modelable_body(else_body, ctx, depth) + && is_acyclic_modelable_body(then_body) + && is_acyclic_modelable_body(else_body) } Instruction::Br(_) | Instruction::BrIf(_) @@ -7611,6 +7591,10 @@ fn exec_acyclic( labels: &mut Vec, ctx: &VerificationSignatureContext, depth: usize, + // Monotonic per-ENCODE counter for impure-call havoc names. Threaded (not in + // AcyclicState, which is saved/restored at branches) so the Nth impure call + // gets the same id in the original and optimized encodings → they unify. + havoc_counter: &mut u64, ) -> Result<()> { for instr in body { if !state.reachable { @@ -7625,7 +7609,7 @@ fn exec_acyclic( arity, entries: Vec::new(), }); - exec_acyclic(body, state, labels, ctx, depth)?; + exec_acyclic(body, state, labels, ctx, depth, havoc_counter)?; // Fall-through off the end == implicit `br 0` to this block. if state.reachable { let target = labels.len() - 1; @@ -7658,7 +7642,7 @@ fn exec_acyclic( // then-branch under path ∧ cond≠0 state.path = Bool::and(&[saved_path.clone(), cond_nz.clone()]); state.reachable = true; - exec_acyclic(then_body, state, labels, ctx, depth)?; + exec_acyclic(then_body, state, labels, ctx, depth, havoc_counter)?; if state.reachable { acyclic_push_branch(labels, target, state.path.clone(), state)?; } @@ -7669,7 +7653,7 @@ fn exec_acyclic( state.memory = saved_memory; state.path = Bool::and(&[saved_path, cond_nz.not()]); state.reachable = true; - exec_acyclic(else_body, state, labels, ctx, depth)?; + exec_acyclic(else_body, state, labels, ctx, depth, havoc_counter)?; if state.reachable { acyclic_push_branch(labels, target, state.path.clone(), state)?; } @@ -7727,20 +7711,38 @@ fn exec_acyclic( state.reachable = false; } Instruction::Call(g) => { - let callee = ctx + let callee_opt = ctx .function_bodies .get(*g as usize) - .and_then(|o| o.as_deref()) - .ok_or_else(|| { - anyhow!("acyclic executor: call to import/unknown func {}", g) - })?; - if is_noreturn_callee(callee) { - // ⊥-diverge: control never returns (panic→unreachable). The - // path traps and drops out of the result ITE — NEVER havoc - // (Alive2 `ub` discipline; havoc reopens #155/#159). - state.reachable = false; - } else { - encode_acyclic_call(callee, state, ctx, depth)?; + .and_then(|o| o.as_deref()); + match callee_opt { + // No-return callee (panic→unreachable): ⊥-diverge — the path + // traps and drops out of the result ITE, NEVER havoc (Alive2 + // `ub` discipline; havoc reopens #155/#159). + Some(callee) if is_noreturn_callee(callee) => { + state.reachable = false; + } + // Acyclic, single-result local callee: model BY BODY (the + // same executor models the original `call F` and the inlined + // body identically → the inline is provable). Its OWN impure + // calls get havoc'd via the shared counter. + Some(callee) + if depth + 1 < MAX_ACYCLIC_CALL_DEPTH + && callee.signature.results.len() <= 1 + && is_acyclic_modelable_body(&callee.instructions) => + { + encode_acyclic_call(callee, state, ctx, depth, havoc_counter)?; + } + // Impure / opaque call (import, multi-result, too-deep, or a + // local callee with a loop/unmodelable shape): HAVOC. Sound + // and conservative — corresponding calls in the original and + // optimized encodings get the SAME havoc id (shared counter) + // so they unify; only a havoc-vs-CONCRETE mismatch (e.g. + // original havocs `call F` but optimized inlines F's effect) + // refuses to prove → revert (the #155/#159 guard). + _ => { + acyclic_havoc_call(*g, state, ctx, havoc_counter)?; + } } } // PR-C Phase 2: memory loads/stores via the SAME shared helpers the @@ -7849,6 +7851,55 @@ fn exec_acyclic( Ok(()) } +/// Model an opaque / impure `call g` by HAVOC (PR-C Phase 2b): pop the args, +/// push a fresh symbolic per result, and havoc all globals + memory. Naming is +/// deterministic in the per-encode `havoc_counter` so the Nth impure call gets +/// the SAME names in the original and optimized encodings → they unify in Z3 +/// (the original's `call F` and the optimized's `call F` cancel). Mirrors the +/// main encoder's conservative Call tier. Used for imports and local callees the +/// acyclic executor cannot model by-body (loops, multi-result, too-deep). +#[cfg(feature = "verification")] +#[allow(dead_code)] +fn acyclic_havoc_call( + g: u32, + state: &mut AcyclicState, + ctx: &VerificationSignatureContext, + havoc_counter: &mut u64, +) -> Result<()> { + let sig = ctx + .get_function_signature(g) + .ok_or_else(|| anyhow!("acyclic executor: no signature for call {}", g))?; + let nparams = sig.params.len(); + if state.stack.len() < nparams { + return Err(anyhow!("acyclic executor: havoc-call arg underflow")); + } + // Discard the args (the opaque result is independent of them, matching the + // main encoder's conservative tier). + state.stack.truncate(state.stack.len() - nparams); + // Fresh symbolic result(s). + for (i, rt) in sig.results.iter().enumerate() { + state.stack.push(BV::new_const( + format!("call_{}_result_{}", g, i), + bv_width_for_value_type(*rt), + )); + } + // Havoc globals + memory with the shared deterministic id. + let id = *havoc_counter; + *havoc_counter += 1; + for (idx, global) in state.globals.iter_mut().enumerate() { + *global = BV::new_const( + format!("global_{}_after_call_{}_{}", idx, g, id), + global.get_size(), + ); + } + state.memory = Array::new_const( + format!("mem_after_call_{}_{}", g, id), + &Sort::bitvector(32), + &Sort::bitvector(8), + ); + Ok(()) +} + /// Model a `call` to `callee` BY BODY: pop the args, recursively encode the /// callee through `exec_acyclic` (params bound to the args, declared locals /// zero-init, globals SHARED with the caller so the callee's global writes @@ -7863,6 +7914,7 @@ fn encode_acyclic_call( state: &mut AcyclicState, ctx: &VerificationSignatureContext, depth: usize, + havoc_counter: &mut u64, ) -> Result<()> { if depth + 1 >= MAX_ACYCLIC_CALL_DEPTH { return Err(anyhow!("acyclic executor: call recursion too deep")); @@ -7905,6 +7957,7 @@ fn encode_acyclic_call( &mut sub_labels, ctx, depth + 1, + havoc_counter, )?; if sub.reachable { acyclic_push_branch(&mut sub_labels, 0, sub.path.clone(), &sub)?; @@ -7939,7 +7992,7 @@ fn encode_acyclic_function( func: &Function, ctx: &VerificationSignatureContext, ) -> Result> { - if !is_acyclic_modelable_body(&func.instructions, ctx, 0) { + if !is_acyclic_modelable_body(&func.instructions) { return Err(anyhow!("function not acyclic-modelable")); } let arity = func.signature.results.len(); @@ -7980,7 +8033,17 @@ fn encode_acyclic_function( arity, entries: Vec::new(), }]; - exec_acyclic(&func.instructions, &mut state, &mut labels, ctx, 0)?; + // Per-encode impure-call havoc counter (reset to 0 each encode so the Nth + // impure call gets the same id in the original and optimized encodings). + let mut havoc_counter: u64 = 0; + exec_acyclic( + &func.instructions, + &mut state, + &mut labels, + ctx, + 0, + &mut havoc_counter, + )?; // Fall-through off the function body == implicit return. if state.reachable { acyclic_push_branch(&mut labels, 0, state.path.clone(), &state)?; @@ -8022,8 +8085,8 @@ pub(crate) fn verify_acyclic_equivalence( return Err(anyhow!("acyclic verify: signature changed")); } // Only handle the precise acyclic case; anything else → caller falls back. - if !is_acyclic_modelable_body(&original.instructions, ctx, 0) - || !is_acyclic_modelable_body(&optimized.instructions, ctx, 0) + if !is_acyclic_modelable_body(&original.instructions) + || !is_acyclic_modelable_body(&optimized.instructions) { return Err(anyhow!("acyclic verify: not acyclic-modelable")); } @@ -8432,9 +8495,8 @@ mod tests { let with_loop = parse::parse_wat("(module (func (param i32) (result i32) (loop local.get 0)))") .unwrap(); - let ctx_loop = VerificationSignatureContext::from_module(&with_loop); assert!( - !is_acyclic_modelable_body(&with_loop.functions[0].instructions, &ctx_loop, 0), + !is_acyclic_modelable_body(&with_loop.functions[0].instructions), "loops (back-edges) must be rejected" ); @@ -8444,17 +8506,15 @@ mod tests { "(module (memory 1) (func (param i32) (result i32) local.get 0 i32.load))", ) .unwrap(); - let ctx_mem = VerificationSignatureContext::from_module(&with_memory); assert!( - is_acyclic_modelable_body(&with_memory.functions[0].instructions, &ctx_mem, 0), + is_acyclic_modelable_body(&with_memory.functions[0].instructions), "memory ops are modelable in Phase 2" ); // A pure-integer br_table switch must be admitted. let switch = parse::parse_wat(SWITCH_10_20_30).unwrap(); - let ctx_sw = VerificationSignatureContext::from_module(&switch); assert!( - is_acyclic_modelable_body(&switch.functions[0].instructions, &ctx_sw, 0), + is_acyclic_modelable_body(&switch.functions[0].instructions), "pure-integer acyclic br_table function must be admitted" ); } @@ -8573,6 +8633,75 @@ mod tests { ); } + /// PR-C Phase 2b: a call to an impure import is HAVOC'd. Two encodings of the + /// same impure call must UNIFY (same per-encode havoc id) → equivalent. (If + /// the counter weren't deterministic per-encode, the two `call_0_result_0` + /// names would differ and this would spuriously fail.) + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_havoc_call_unifies() { + let calls_import = r#" + (module + (import "env" "f" (func $f (param i32) (result i32))) + (func (param i32) (result i32) + local.get 0 + call $f))"#; + assert!( + acyclic_equiv(calls_import, calls_import).unwrap(), + "two encodings of the same impure call must unify (havoc id parity)" + ); + } + + /// PR-C Phase 2b SOUNDNESS: a havoc'd call result must NOT be provably equal + /// to a concrete value — `call $f(x)` (opaque) is not `x`. This is the guard + /// against a havoc-vs-concrete false equivalence (#155/#159 surface). + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_havoc_not_concrete() { + let calls_import = r#" + (module + (import "env" "f" (func $f (param i32) (result i32))) + (func (param i32) (result i32) + local.get 0 + call $f))"#; + let identity = r#" + (module + (import "env" "f" (func $f (param i32) (result i32))) + (func (param i32) (result i32) + local.get 0))"#; + assert!( + !acyclic_equiv(calls_import, identity).unwrap(), + "a havoc'd call result must NOT prove equal to a concrete value" + ); + } + + /// PR-C Phase 2b SOUNDNESS: an impure call havocs memory, so a load AFTER the + /// call reads the havoc'd memory — NOT provably equal to loading the initial + /// memory. Guards against the havoc failing to invalidate memory. + #[test] + #[cfg(feature = "verification")] + fn test_acyclic_havoc_memory() { + let load_after_call = r#" + (module + (import "env" "g" (func $g)) + (memory 1) + (func (result i32) + call $g + i32.const 0 + i32.load))"#; + let load_initial = r#" + (module + (import "env" "g" (func $g)) + (memory 1) + (func (result i32) + i32.const 0 + i32.load))"#; + assert!( + !acyclic_equiv(load_after_call, load_initial).unwrap(), + "a load after an impure call must read havoc'd memory, not the initial" + ); + } + /// M3: the public verify_acyclic_equivalence fast-path proves a caller that /// `call`s a br_table decide equivalent to the inlined switch, and rejects a /// non-equivalent optimized body — the exact inline verify-or-revert pattern. From 186d2361eec676a6a2f4a648c65fe4d16de1e542 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Thu, 18 Jun 2026 08:11:33 +0200 Subject: [PATCH 15/20] feat(219): carrier scalar-forwarding mechanism in rewrite_with_dataflow (dormant) Foundation for the #219 perf milestone (dissolve the u64 ABI pack/unpack the proven inline leaves in z_impl). Extends rewrite_with_dataflow to forward a SINGLE-ASSIGNMENT local's pure defining expression to its uses (carrier mem2reg), gated by a caller-supplied `OptimizationEnv.single_assign` set -- EMPTY by default, so this is DORMANT and behavior is byte-identical to before (confirmed: full loom-core lib suite 413 passed / 0 failed; loom-shared clippy clean). Soundness of the mechanism (the loom-core pass that seeds single_assign lands next, and only seeds TOP-LEVEL single-assignment locals so the def dominates all uses): - is_forwardable_expr: pure-op whitelist (const/local/global/int arith-bitwise-shift-rotate-compare/converts/select); calls, loads/stores, div/rem, floats -> not forwardable. - reaching-defs guard: invalidate_pins_referencing drops a pinned forwarding whenever an input local is reassigned (in-traversal for straight-line/Block/ Loop; the If arm intersects both forks' pins so a branch that reassigns an input drops the pin). So a forwarded expression always carries its def-time inputs. - pinned survives CF clear() (sound for single-assignment: one value on every path reaching a use). This transform is NOT Z3-backstoppable for the seam (void z_impl + value flows only into a havoc'd impure-call arg -> vacuous pass); gale's G474RE silicon is the behavioral gate (per #219 -- gale will flash inline-only vs inline+teardown). Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-shared/src/lib.rs | 201 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index 2cce36e..ee26a9b 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -2712,6 +2712,18 @@ pub struct OptimizationEnv { pub locals: std::collections::HashMap, /// Memory state: location → stored value pub memory: std::collections::HashMap, + /// #219 carrier scalar-forwarding: indices of SINGLE-ASSIGNMENT locals + /// (exactly one write in the whole function, precomputed by the caller) whose + /// pure defining expression may be forwarded to its uses. Empty by default → + /// forwarding OFF, behavior identical to plain dataflow. + pub single_assign: std::collections::HashSet, + /// #219: captured defining expression for each `single_assign` local, recorded + /// at its (single) `local.set`/`local.tee`. SURVIVES control-flow `clear()` + /// (sound: a single-assignment local has one value on every path that reaches + /// a use). INVALIDATED when any input local of the expression is reassigned + /// (reaching-defs guard), so a forwarded expression always carries its + /// def-time inputs. `local.get` of such a local forwards to this expression. + pub pinned: std::collections::HashMap, } impl Default for OptimizationEnv { @@ -2725,6 +2737,8 @@ impl OptimizationEnv { OptimizationEnv { locals: std::collections::HashMap::new(), memory: std::collections::HashMap::new(), + single_assign: std::collections::HashSet::new(), + pinned: std::collections::HashMap::new(), } } @@ -2737,6 +2751,162 @@ impl OptimizationEnv { /// Legacy type alias for compatibility pub type LocalEnv = OptimizationEnv; +/// #219 carrier scalar-forwarding: is `v` a side-effect-free, re-evaluatable +/// value expression safe to FORWARD to a single-assignment local's use sites? +/// Whitelist of total pure ops (constants, locals, globals, integer +/// arithmetic/bitwise/shift/rotate/compare, width converts, select); recurses +/// into operands. Calls, ANY load/store, div/rem (trap), floats, blocks/branches +/// → `false`. Bounded to the listed variants (NOT a full 187-variant walk): +/// unknown/impure → `false` (conservative, sound). +#[allow(clippy::match_like_matches_macro)] +fn is_forwardable_expr(v: &Value) -> bool { + match v.data() { + ValueData::I32Const { .. } + | ValueData::I64Const { .. } + | ValueData::LocalGet { .. } + | ValueData::GlobalGet { .. } => true, + ValueData::I32WrapI64 { val } + | ValueData::I64ExtendI32S { val } + | ValueData::I64ExtendI32U { val } + | ValueData::I32Eqz { val } + | ValueData::I64Eqz { val } => is_forwardable_expr(val), + ValueData::I32Add { lhs, rhs } + | ValueData::I32Sub { lhs, rhs } + | ValueData::I32Mul { lhs, rhs } + | ValueData::I32And { lhs, rhs } + | ValueData::I32Or { lhs, rhs } + | ValueData::I32Xor { lhs, rhs } + | ValueData::I32Shl { lhs, rhs } + | ValueData::I32ShrS { lhs, rhs } + | ValueData::I32ShrU { lhs, rhs } + | ValueData::I32Rotl { lhs, rhs } + | ValueData::I32Rotr { lhs, rhs } + | ValueData::I32Eq { lhs, rhs } + | ValueData::I32Ne { lhs, rhs } + | ValueData::I32LtS { lhs, rhs } + | ValueData::I32LtU { lhs, rhs } + | ValueData::I32GtS { lhs, rhs } + | ValueData::I32GtU { lhs, rhs } + | ValueData::I32LeS { lhs, rhs } + | ValueData::I32LeU { lhs, rhs } + | ValueData::I32GeS { lhs, rhs } + | ValueData::I32GeU { lhs, rhs } + | ValueData::I64Add { lhs, rhs } + | ValueData::I64Sub { lhs, rhs } + | ValueData::I64Mul { lhs, rhs } + | ValueData::I64And { lhs, rhs } + | ValueData::I64Or { lhs, rhs } + | ValueData::I64Xor { lhs, rhs } + | ValueData::I64Shl { lhs, rhs } + | ValueData::I64ShrS { lhs, rhs } + | ValueData::I64ShrU { lhs, rhs } + | ValueData::I64Rotl { lhs, rhs } + | ValueData::I64Rotr { lhs, rhs } + | ValueData::I64Eq { lhs, rhs } + | ValueData::I64Ne { lhs, rhs } + | ValueData::I64LtS { lhs, rhs } + | ValueData::I64LtU { lhs, rhs } + | ValueData::I64GtS { lhs, rhs } + | ValueData::I64GtU { lhs, rhs } + | ValueData::I64LeS { lhs, rhs } + | ValueData::I64LeU { lhs, rhs } + | ValueData::I64GeS { lhs, rhs } + | ValueData::I64GeU { lhs, rhs } => is_forwardable_expr(lhs) && is_forwardable_expr(rhs), + ValueData::Select { + cond, + true_val, + false_val, + } => { + is_forwardable_expr(cond) + && is_forwardable_expr(true_val) + && is_forwardable_expr(false_val) + } + _ => false, + } +} + +/// #219: does `v` (a forwardable expr — same whitelist as `is_forwardable_expr`) +/// reference `local.get local_idx`? Used to INVALIDATE a pinned forwarding when +/// one of its input locals is reassigned (reaching-defs guard), so a forwarded +/// expression never picks up a later redefinition of an input. +fn expr_references_local(v: &Value, local_idx: u32) -> bool { + match v.data() { + ValueData::LocalGet { idx } => *idx == local_idx, + ValueData::I32Const { .. } | ValueData::I64Const { .. } | ValueData::GlobalGet { .. } => { + false + } + ValueData::I32WrapI64 { val } + | ValueData::I64ExtendI32S { val } + | ValueData::I64ExtendI32U { val } + | ValueData::I32Eqz { val } + | ValueData::I64Eqz { val } => expr_references_local(val, local_idx), + ValueData::I32Add { lhs, rhs } + | ValueData::I32Sub { lhs, rhs } + | ValueData::I32Mul { lhs, rhs } + | ValueData::I32And { lhs, rhs } + | ValueData::I32Or { lhs, rhs } + | ValueData::I32Xor { lhs, rhs } + | ValueData::I32Shl { lhs, rhs } + | ValueData::I32ShrS { lhs, rhs } + | ValueData::I32ShrU { lhs, rhs } + | ValueData::I32Rotl { lhs, rhs } + | ValueData::I32Rotr { lhs, rhs } + | ValueData::I32Eq { lhs, rhs } + | ValueData::I32Ne { lhs, rhs } + | ValueData::I32LtS { lhs, rhs } + | ValueData::I32LtU { lhs, rhs } + | ValueData::I32GtS { lhs, rhs } + | ValueData::I32GtU { lhs, rhs } + | ValueData::I32LeS { lhs, rhs } + | ValueData::I32LeU { lhs, rhs } + | ValueData::I32GeS { lhs, rhs } + | ValueData::I32GeU { lhs, rhs } + | ValueData::I64Add { lhs, rhs } + | ValueData::I64Sub { lhs, rhs } + | ValueData::I64Mul { lhs, rhs } + | ValueData::I64And { lhs, rhs } + | ValueData::I64Or { lhs, rhs } + | ValueData::I64Xor { lhs, rhs } + | ValueData::I64Shl { lhs, rhs } + | ValueData::I64ShrS { lhs, rhs } + | ValueData::I64ShrU { lhs, rhs } + | ValueData::I64Rotl { lhs, rhs } + | ValueData::I64Rotr { lhs, rhs } + | ValueData::I64Eq { lhs, rhs } + | ValueData::I64Ne { lhs, rhs } + | ValueData::I64LtS { lhs, rhs } + | ValueData::I64LtU { lhs, rhs } + | ValueData::I64GtS { lhs, rhs } + | ValueData::I64GtU { lhs, rhs } + | ValueData::I64LeS { lhs, rhs } + | ValueData::I64LeU { lhs, rhs } + | ValueData::I64GeS { lhs, rhs } + | ValueData::I64GeU { lhs, rhs } => { + expr_references_local(lhs, local_idx) || expr_references_local(rhs, local_idx) + } + ValueData::Select { + cond, + true_val, + false_val, + } => { + expr_references_local(cond, local_idx) + || expr_references_local(true_val, local_idx) + || expr_references_local(false_val, local_idx) + } + // A pinned expr only ever holds is_forwardable_expr shapes; anything else + // conservatively "might reference" → invalidate. + _ => true, + } +} + +/// #219: a local `idx` was just (re)assigned — drop any pinned forwarding whose +/// expression references it, so a forwarded expression never captures a stale or +/// post-redefinition input value. +fn invalidate_pins_referencing(env: &mut OptimizationEnv, idx: u32) { + env.pinned + .retain(|_, expr| !expr_references_local(expr, idx)); +} + /// Dataflow-aware ISLE rewrite — tracks local variables and memory state. /// /// Applies all pure structural rewrites (constant folding, algebraic @@ -2747,6 +2917,10 @@ pub type LocalEnv = OptimizationEnv; /// /// Only safe for straight-line code or with env clearing at join points. /// For functions with BrIf/BrTable, use `rewrite_pure` instead. +/// +/// #219: when `env.single_assign` is non-empty, also forwards those +/// single-assignment locals' pure defining expressions (carrier scalar- +/// forwarding), with a reaching-defs invalidation guard. pub fn rewrite_with_dataflow(val: Value, env: &mut OptimizationEnv) -> Value { match val.data() { // Local variable operations @@ -2763,6 +2937,15 @@ pub fn rewrite_with_dataflow(val: Value, env: &mut OptimizationEnv) -> Value { env.locals.remove(idx); } + // #219 reaching-defs guard: reassigning `idx` invalidates any pinned + // forwarding that references it (so a forwarded expr never captures a + // post-redefinition input). Then, if `idx` is the single-assignment + // carrier with a forwardable RHS, pin its defining expression. + invalidate_pins_referencing(env, *idx); + if env.single_assign.contains(idx) && is_forwardable_expr(&simplified_val) { + env.pinned.insert(*idx, simplified_val.clone()); + } + local_set(*idx, simplified_val) } @@ -2770,6 +2953,10 @@ pub fn rewrite_with_dataflow(val: Value, env: &mut OptimizationEnv) -> Value { // Look up in environment - dataflow analysis! if let Some(known_val) = env.locals.get(idx) { known_val.clone() + } else if let Some(pinned_val) = env.pinned.get(idx) { + // #219 carrier scalar-forwarding: forward the single-assignment + // local's defining expression to this use, exposing pack/unpack. + pinned_val.clone() } else { local_get(*idx) } @@ -2787,6 +2974,13 @@ pub fn rewrite_with_dataflow(val: Value, env: &mut OptimizationEnv) -> Value { env.locals.remove(idx); } + // #219 (see LocalSet): invalidate pins referencing `idx`, then pin the + // single-assignment carrier. local.tee both stores AND leaves the value. + invalidate_pins_referencing(env, *idx); + if env.single_assign.contains(idx) && is_forwardable_expr(&simplified_val) { + env.pinned.insert(*idx, simplified_val.clone()); + } + local_tee(*idx, simplified_val) } @@ -3192,6 +3386,13 @@ pub fn rewrite_with_dataflow(val: Value, env: &mut OptimizationEnv) -> Value { // After if: clear env. We don't know which branch was taken. env.locals.clear(); env.invalidate_memory(); + // #219 reaching-defs across the fork: keep a pin only if it SURVIVED + // in BOTH branches. A branch that reassigned one of the pin's input + // locals dropped it (via invalidate_pins_referencing during that + // branch), so the intersection drops any pin whose inputs could have + // changed on either path — sound regardless of which branch ran. + env.pinned + .retain(|k, _| then_env.pinned.contains_key(k) && else_env.pinned.contains_key(k)); Value(Box::new(ValueData::If { label: label.clone(), block_type: block_type.clone(), From d52c2a51d0defce254ce1bbea285727f250401ba Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Thu, 18 Jun 2026 09:52:40 +0200 Subject: [PATCH 16/20] feat(219): forward_carrier_locals pass (conservative top-level gate; mechanism wired) Activates the #219 carrier scalar-forwarding via the dormant mechanism (@186d236): new optimize pass forward_carrier_locals + `forward-carrier` CLI pass (after inline) + a call in optimize_module Phase 1b. Per function it forwards locals with exactly one write whose single write is at the FUNCTION TOP LEVEL (def dominates all uses), then rewrite_pure applies the SROA rules; verify_or_revert guards (gale silicon is the behavioral gate for the void seam). Full lib suite 413/0; fmt + clippy clean. KNOWN-CONSERVATIVE (does not yet dissolve the sem_give seam): the seam carrier (local.tee 3 in inlined z_impl) is NESTED in `block @2`, not top level, so the top-level proxy excludes it. Verified by hand it IS sound to forward -- the use `local.get 3` is reached only via `br_if @2` positioned AFTER the tee, so the def dominates the use (overflow arm traps via `unreachable`, never reaching it), and inputs local4/local7 are set only before the def. Admitting it needs a real TRAP-AWARE STRUCTURED DOMINANCE check to replace the top-level proxy (next); then the dual silicon flash (inline-only vs inline+teardown) validates. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-cli/src/main.rs | 12 +++++ loom-core/src/lib.rs | 101 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/loom-cli/src/main.rs b/loom-cli/src/main.rs index 8f18e6f..8318253 100644 --- a/loom-cli/src/main.rs +++ b/loom-cli/src/main.rs @@ -502,6 +502,18 @@ fn optimize_command( track_pass("inline", before, after); } + // #219: dissolve the u64 ABI carrier the inline leaves behind (scalar-forward + // the single-assignment carrier to its unpack sites, then SROA). Runs right + // after inline so the carrier is fresh; downstream dce/dead-locals reap it. + if should_run("forward-carrier") { + println!(" Running: forward-carrier"); + let before = count_instructions(&module); + loom_core::optimize::forward_carrier_locals(&mut module) + .context("Carrier forwarding failed")?; + let after = count_instructions(&module); + track_pass("forward-carrier", before, after); + } + if should_run("precompute") { println!(" Running: precompute"); let before = count_instructions(&module); diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 6667327..38b3de6 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -6953,6 +6953,11 @@ pub mod optimize { // are inlined, enabling subsequent passes to optimize across boundaries. inline_functions(module)?; + // Phase 1b (#219): dissolve the u64 ABI carrier the inline leaves behind — + // scalar-forward the single-assignment carrier to its unpack sites so the + // SROA rules can collapse the pack/unpack round-trip. + forward_carrier_locals(module)?; + // Phase 2: Constant folding (ISLE pattern rewrites) constant_folding(module)?; @@ -8940,6 +8945,102 @@ pub mod optimize { Ok(()) } + /// #219 carrier scalar-forwarding (the perf milestone): forward a TOP-LEVEL + /// single-assignment local's pure defining expression to its use sites, then + /// let the committed SROA rules dissolve the now-exposed pack/unpack. This + /// removes the u64 ABI carrier the proven `br_table` seam inline leaves inside + /// `z_impl` (decide builds `extend_i32_u<<32 | status`, z_impl tears it back + /// with `&0xff` / `>>32` through a dead i64 carrier). + /// + /// SOUNDNESS (this transform is NOT Z3-backstoppable for the void seam fn — + /// the value flows only into a havoc'd impure-call arg → vacuous pass; gale's + /// G474RE silicon is the behavioral gate, #219): + /// - We forward ONLY locals with exactly one write in the whole function + /// (`analyze_locals` `sets.len()==1`) whose single write is at the + /// FUNCTION TOP LEVEL (not nested in any Block/If/Loop). Top-level + /// single-assignment ⇒ the def dominates every use (you can't reach a + /// later top-level statement without executing the earlier one). + /// - The forwarding mechanism (`rewrite_with_dataflow` + `OptimizationEnv`) + /// additionally invalidates a forwarding when any input local is + /// reassigned (reaching-defs), so a forwarded expression always carries + /// its def-time inputs. + /// - `is_forwardable_expr` admits only pure ops (no calls/loads/stores/ + /// div-rem/floats), so re-evaluating the expression at a use is safe. + /// + /// `verify_or_revert` still guards non-void functions and stack correctness. + pub fn forward_carrier_locals(module: &mut Module) -> Result<()> { + use crate::stack::validation::{ValidationContext, ValidationGuard}; + use crate::verify::{TranslationValidator, VerificationSignatureContext}; + use loom_isle::{LocalEnv, rewrite_pure, rewrite_with_dataflow}; + use std::collections::{BTreeMap, HashSet}; + + let ctx = ValidationContext::from_module(module); + let verify_sig_ctx = VerificationSignatureContext::from_module(module); + + for func in &mut module.functions { + if has_unknown_instructions(func) || has_unsupported_isle_instructions(func) { + continue; + } + + // Precompute TOP-LEVEL single-assignment locals. + let (usage, _equiv) = analyze_locals(&func.instructions); + let mut top_level_writes: BTreeMap = BTreeMap::new(); + for instr in &func.instructions { + if let Instruction::LocalSet(i) | Instruction::LocalTee(i) = instr { + *top_level_writes.entry(*i).or_insert(0) += 1; + } + } + let single_assign: HashSet = usage + .iter() + .filter(|(idx, u)| { + u.sets.len() == 1 && top_level_writes.get(*idx).copied().unwrap_or(0) == 1 + }) + .map(|(idx, _)| *idx) + .collect(); + if single_assign.is_empty() { + continue; + } + + let guard = ValidationGuard::with_context(func, "forward_carrier_locals", ctx.clone()); + let translator = TranslationValidator::new_with_context( + func, + "forward_carrier_locals", + verify_sig_ctx.clone(), + ); + let original_instructions = func.instructions.clone(); + + let had_end = func.instructions.last() == Some(&Instruction::End); + if let Ok(terms) = super::terms::instructions_to_terms(&func.instructions) { + if !terms.is_empty() { + let mut env = LocalEnv::new(); + env.single_assign = single_assign; + // Forward the carrier (dataflow), then apply the structural + // SROA rules (rewrite_pure) to dissolve the exposed pack/unpack. + let forwarded: Vec<_> = terms + .into_iter() + .map(|t| rewrite_with_dataflow(t, &mut env)) + .map(rewrite_pure) + .collect(); + if let Ok(mut new_instrs) = super::terms::terms_to_instructions(&forwarded) { + if !new_instrs.is_empty() { + if !had_end && new_instrs.last() == Some(&Instruction::End) { + new_instrs.pop(); + } + func.instructions = new_instrs; + } + } + } + } + + if guard.validate(func).is_err() || translator.verify(func).is_err() { + eprintln!("forward_carrier_locals: reverting function (verification rejected)"); + crate::stats::record_revert("forward_carrier_locals"); + func.instructions = original_instructions; + } + } + Ok(()) + } + #[derive(Debug, Clone)] struct LocalUsage { // Positions where this local is read (local.get) From ef884e96cfc0760b74fc1fd77db3c114b847a6d0 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Thu, 18 Jun 2026 11:29:10 +0200 Subject: [PATCH 17/20] feat(219): trap-aware structured dominance for carrier forwarding (gate generalized) Replaces the conservative top-level-only proxy in forward_carrier_locals with carrier_def_dominates_uses: a single-assignment local is forwarded iff its def DOMINATES all uses by a sound structured-CFG argument (def not under If/Loop; all uses textually after it; no pre-def Br/BrIf/BrTable escapes the def's enclosing blocks; returns/unreachable before the def are safe). Admits the nested-but- dominating seam carrier the top-level proxy excluded. Soundness over completeness (a wrong `true` = silent miscompile). Full lib suite 414/0 incl. test_carrier_ dominance (straight-line, use-before-def, def-in-If, nested-dominating, escaping-br). fmt + clippy clean. NOT YET dissolving the e2e seam: inline+forward-carrier leaves z_impl BYTE-IDENTICAL to inline-only (same hash) -- the pass makes no NET change to z_impl. The dominance gate is no longer the blocker; the pass is either finding single_assign empty for z_impl OR forwarding-then-reverting (revert restores the identical inline body) OR the SROA rules not firing on forwarded terms (instructions_to_terms DOES handle z_impl's Call/load/br_table, so not a conversion skip). Next: instrument forward_carrier_locals (single_assign contents / changed? / reverted?) to pinpoint. Refs #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 227 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 213 insertions(+), 14 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index 38b3de6..a3e7cdc 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -8945,6 +8945,123 @@ pub mod optimize { Ok(()) } + /// #219: trap-aware structured dominance for carrier scalar-forwarding. + /// Returns true iff `local`'s SINGLE def dominates ALL its uses, so its + /// defining expression can be forwarded to every use. Conservative + /// structured-CFG argument (rejects on any uncertainty — soundness over + /// completeness; a wrong `true` here is a silent miscompile): + /// - exactly one write of `local`, and it is NOT inside any `If`/`Loop` + /// (all its enclosing blocks are plain `Block`/function root, hence + /// unconditionally entered); + /// - every use is textually AFTER the def; + /// - no `Br`/`BrIf`/`BrTable` textually BEFORE the def targets a block that + /// ENCLOSES the def (an "ancestor"): such a branch could deliver control + /// to the def's enclosing-block continuation — possibly a use — without + /// passing the def. Branches to inner blocks that close before the def + /// (or to the function frame = return, which terminates) are safe, as is + /// any `unreachable`/`return` before the def (they never reach a use). + /// + /// Together these make the def reached on every non-trapping path from entry, + /// so it dominates all (textually-later) uses. + pub(crate) fn carrier_def_dominates_uses(instrs: &[Instruction], local: u32) -> bool { + #[derive(Default)] + struct Scan { + seq: usize, + next_block_id: usize, + def_seq: Option, + def_ancestors: std::collections::HashSet, + def_disqualified: bool, + uses: Vec, + // (branch seq, resolved target block ids). usize::MAX = function frame + // (return-equivalent) — never reaches a use. + branches: Vec<(usize, Vec)>, + } + // Resolve a relative branch depth to the open block's id (innermost = + // depth 0); usize::MAX if it targets the function frame. + fn resolve(open: &[(usize, bool)], depth: u32) -> usize { + let d = depth as usize; + if d < open.len() { + open[open.len() - 1 - d].0 + } else { + usize::MAX + } + } + fn walk(instrs: &[Instruction], s: &mut Scan, open: &mut Vec<(usize, bool)>, local: u32) { + for instr in instrs { + s.seq += 1; + match instr { + Instruction::LocalSet(i) | Instruction::LocalTee(i) if *i == local => { + if s.def_seq.is_some() { + s.def_disqualified = true; // more than one write + } else { + s.def_seq = Some(s.seq); + s.def_ancestors = open.iter().map(|(id, _)| *id).collect(); + if open.iter().any(|(_, conditional)| *conditional) { + s.def_disqualified = true; // def under an If/Loop + } + } + } + Instruction::LocalGet(i) if *i == local => s.uses.push(s.seq), + Instruction::Br(d) | Instruction::BrIf(d) => { + s.branches.push((s.seq, vec![resolve(open, *d)])); + } + Instruction::BrTable { targets, default } => { + let mut ids: Vec = + targets.iter().map(|d| resolve(open, *d)).collect(); + ids.push(resolve(open, *default)); + s.branches.push((s.seq, ids)); + } + Instruction::Block { body, .. } => { + let id = s.next_block_id; + s.next_block_id += 1; + open.push((id, false)); + walk(body, s, open, local); + open.pop(); + } + Instruction::Loop { body, .. } => { + let id = s.next_block_id; + s.next_block_id += 1; + open.push((id, true)); // back-edge → conservative + walk(body, s, open, local); + open.pop(); + } + Instruction::If { + then_body, + else_body, + .. + } => { + let id = s.next_block_id; + s.next_block_id += 1; + open.push((id, true)); + walk(then_body, s, open, local); + walk(else_body, s, open, local); + open.pop(); + } + _ => {} + } + } + } + let mut s = Scan::default(); + let mut open: Vec<(usize, bool)> = Vec::new(); + walk(instrs, &mut s, &mut open, local); + + let def_seq = match s.def_seq { + Some(seq) if !s.def_disqualified => seq, + _ => return false, + }; + // Every use must be after the def. + if s.uses.iter().any(|&u| u <= def_seq) { + return false; + } + // No pre-def branch may escape to a def-ancestor block. + for (bseq, tgts) in &s.branches { + if *bseq < def_seq && tgts.iter().any(|t| s.def_ancestors.contains(t)) { + return false; + } + } + true + } + /// #219 carrier scalar-forwarding (the perf milestone): forward a TOP-LEVEL /// single-assignment local's pure defining expression to its use sites, then /// let the committed SROA rules dissolve the now-exposed pack/unpack. This @@ -8955,11 +9072,11 @@ pub mod optimize { /// SOUNDNESS (this transform is NOT Z3-backstoppable for the void seam fn — /// the value flows only into a havoc'd impure-call arg → vacuous pass; gale's /// G474RE silicon is the behavioral gate, #219): - /// - We forward ONLY locals with exactly one write in the whole function - /// (`analyze_locals` `sets.len()==1`) whose single write is at the - /// FUNCTION TOP LEVEL (not nested in any Block/If/Loop). Top-level - /// single-assignment ⇒ the def dominates every use (you can't reach a - /// later top-level statement without executing the earlier one). + /// - We forward ONLY locals with exactly one write whose def DOMINATES all + /// uses, by `carrier_def_dominates_uses` (trap-aware structured + /// dominance: def not under If/Loop, all uses after it, no pre-def branch + /// escapes the def's enclosing blocks). The def is then reached on every + /// non-trapping path from entry ⇒ dominates every use. /// - The forwarding mechanism (`rewrite_with_dataflow` + `OptimizationEnv`) /// additionally invalidates a forwarding when any input local is /// reassigned (reaching-defs), so a forwarded expression always carries @@ -8972,7 +9089,7 @@ pub mod optimize { use crate::stack::validation::{ValidationContext, ValidationGuard}; use crate::verify::{TranslationValidator, VerificationSignatureContext}; use loom_isle::{LocalEnv, rewrite_pure, rewrite_with_dataflow}; - use std::collections::{BTreeMap, HashSet}; + use std::collections::HashSet; let ctx = ValidationContext::from_module(module); let verify_sig_ctx = VerificationSignatureContext::from_module(module); @@ -8982,18 +9099,15 @@ pub mod optimize { continue; } - // Precompute TOP-LEVEL single-assignment locals. + // Precompute the forwardable carriers: single-assignment locals whose + // (one) def DOMINATES all uses by trap-aware structured dominance — + // see carrier_def_dominates_uses. (analyze_locals' `sets.len()==1` is + // the fast pre-filter; the dominance check is the soundness gate.) let (usage, _equiv) = analyze_locals(&func.instructions); - let mut top_level_writes: BTreeMap = BTreeMap::new(); - for instr in &func.instructions { - if let Instruction::LocalSet(i) | Instruction::LocalTee(i) = instr { - *top_level_writes.entry(*i).or_insert(0) += 1; - } - } let single_assign: HashSet = usage .iter() .filter(|(idx, u)| { - u.sets.len() == 1 && top_level_writes.get(*idx).copied().unwrap_or(0) == 1 + u.sets.len() == 1 && carrier_def_dominates_uses(&func.instructions, **idx) }) .map(|(idx, _)| *idx) .collect(); @@ -14711,6 +14825,91 @@ mod tests { // Just test that ISLE types are accessible } + // #219: trap-aware structured dominance for carrier forwarding. + #[test] + fn test_carrier_dominance() { + use crate::optimize::carrier_def_dominates_uses; + let dom = |wat: &str, local: u32| -> bool { + let m = parse::parse_wat(wat).unwrap(); + carrier_def_dominates_uses(&m.functions[0].instructions, local) + }; + + // Straight-line: def (local 1) then use → dominates. + assert!( + dom( + "(module (func (param i32) (result i32) (local i32) + local.get 0 local.set 1 local.get 1))", + 1 + ), + "straight-line def-before-use must dominate" + ); + + // Use before def → not dominated. + assert!( + !dom( + "(module (func (param i32) (result i32) (local i32) + local.get 1 drop local.get 0 local.set 1 local.get 1))", + 1 + ), + "a use before the def must NOT be dominated" + ); + + // Def inside an If branch → not dominated (conditional). + assert!( + !dom( + "(module (func (param i32) (result i32) (local i32) + local.get 0 + if (result i32) local.get 0 local.set 1 i32.const 1 else i32.const 0 end + local.get 1 i32.add))", + 1 + ), + "a def inside an If branch must NOT be dominated" + ); + + // Seam shape: def nested in a block, reached after an inner dispatch + // block whose branch targets that INNER block (does not escape), use + // after the outer block via a br_if positioned AFTER the def → dominates. + assert!( + dom( + "(module (func (param i32) (result i32) (local i32) + block (result i32) + block + local.get 0 + br_if 0 + end + local.get 0 + local.set 1 + local.get 0 + br_if 1 + i32.const 7 + end + local.get 1 + i32.add))", + 1 + ), + "nested def reached past an inner-only branch, used after, must dominate" + ); + + // Escape shape: a branch BEFORE the def targets the def's ENCLOSING block + // (exits past where the def lives) → not dominated. + assert!( + !dom( + "(module (func (param i32) (result i32) (local i32) + block (result i32) + local.get 0 + br_if 0 + local.get 0 + local.set 1 + i32.const 7 + end + local.get 1 + i32.add))", + 1 + ), + "a pre-def branch escaping the def's enclosing block must NOT dominate" + ); + } + #[test] fn test_parse_wat_simple() { let wat = r#" From 14f5b1a8f40a4b5277bff241030d4807cb1cd751 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Thu, 18 Jun 2026 23:19:42 +0200 Subject: [PATCH 18/20] feat(219): instruction-level carrier forwarding + windowed SROA (br_table-safe seam teardown) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The term IR round-trip is lossy for control flow (br_if/br_table are pushed as value terms), so term-based carrier forwarding turns a br_table function like z_impl stack-invalid. Replace it with an instruction-level transform: - simplify_pure_windows: apply rewrite_pure only to maximal pure straight-line windows (which round-trip faithfully), never across branches. - extract_carrier_rhs + forward_carrier_in_body: forward a single-assignment dominating carrier's pure RHS to its uses, then drop the dead def (tee keeps the value on the stack; set drops set+RHS). - never-pessimize size guard: revert any forwarding that does not shrink the fn. - rewrite_pure: extend_i32_u(x) & M -> extend_i32_u(x) when M's low 32 bits are all set (Z3-sound — a zero-extend already fits in 32 bits). On the sem seam the &0xff branch fully dissolves and the shl/tee vanish; a status>>32 residual remains pending a narrow-local value analysis. Sound, validates, and net-shrinks (verify_or_revert + size guard). gale G474RE silicon is the behavioral gate. Refs: #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 393 ++++++++++++++++++++++++++++++++++++----- loom-shared/src/lib.rs | 15 ++ 2 files changed, 364 insertions(+), 44 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index a3e7cdc..aa8b58f 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -9062,6 +9062,262 @@ pub mod optimize { true } + /// #219 STEP 1 — is this instruction a pure, side-effect-free, integer-domain + /// value computation whose term round-trip is faithful? + /// + /// The term IR is a value-expression stack model: control-flow ops (`br_if`/ + /// `br_table`) are pushed as value terms, so a mid-block branch followed by + /// trailing stack computation does NOT round-trip (`instructions_to_terms` → + /// `terms_to_instructions` is lossy — see #219). But a maximal contiguous run + /// of these *pure* ops round-trips faithfully, so we can safely apply the + /// structural rewriter (`rewrite_pure`) to such a "window". We restrict to the + /// integer domain (no floats — avoids NaN-bit canonicalization questions) and + /// exclude memory access, calls, control flow, and local/global writes. + fn is_simple_pure_instr(instr: &Instruction) -> bool { + use Instruction::*; + matches!( + instr, + // Constants (integer only) + I32Const(_) | I64Const(_) + // Reads (no side effects, no writes) + | LocalGet(_) | GlobalGet(_) + // Integer binary ops (consume 2, produce 1) + | I32Add | I32Sub | I32Mul | I32DivS | I32DivU | I32RemS | I32RemU + | I32And | I32Or | I32Xor | I32Shl | I32ShrS | I32ShrU | I32Rotl | I32Rotr + | I32Eq | I32Ne | I32LtS | I32LtU | I32GtS | I32GtU | I32LeS | I32LeU | I32GeS | I32GeU + | I64Add | I64Sub | I64Mul | I64DivS | I64DivU | I64RemS | I64RemU + | I64And | I64Or | I64Xor | I64Shl | I64ShrS | I64ShrU | I64Rotl | I64Rotr + | I64Eq | I64Ne | I64LtS | I64LtU | I64GtS | I64GtU | I64LeS | I64LeU | I64GeS | I64GeU + // Integer unary ops (consume 1, produce 1) + | I32Eqz | I32Clz | I32Ctz | I32Popcnt + | I64Eqz | I64Clz | I64Ctz | I64Popcnt + | I32WrapI64 | I64ExtendI32S | I64ExtendI32U + | I32Extend8S | I32Extend16S | I64Extend8S | I64Extend16S | I64Extend32S + // Select (consume 3, produce 1) — pure + | Select + ) + } + + /// Net stack effect (produced − consumed) of an instruction sequence. + fn net_stack_effect(instrs: &[Instruction]) -> i32 { + instrs + .iter() + .map(|i| { + let (c, p) = instruction_stack_io(i); + p - c + }) + .sum() + } + + /// Total instruction count, recursing into nested bodies. Used as a + /// never-pessimize guard: forwarding that does not shrink the function is + /// reverted (a single-use carrier whose RHS does not dissolve would + /// otherwise grow the code). + fn count_instrs(instrs: &[Instruction]) -> usize { + let mut n = 0; + for i in instrs { + n += 1; + match i { + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + n += count_instrs(body); + } + Instruction::If { + then_body, + else_body, + .. + } => { + n += count_instrs(then_body) + count_instrs(else_body); + } + _ => {} + } + } + n + } + + /// #219 STEP 1 — rewrite a single pure window through the term IR. + /// + /// Returns `Some(new_instrs)` only when the window is stack-closed + /// (`instructions_to_terms` succeeds — no underflow, i.e. it does not consume + /// values produced before the window) AND the rewrite preserves the net stack + /// effect. Otherwise `None` (leave the window untouched — conservative). + fn rewrite_pure_window(window: &[Instruction]) -> Option> { + use loom_isle::rewrite_pure; + let terms = super::terms::instructions_to_terms(window).ok()?; + if terms.is_empty() { + return None; + } + let rewritten: Vec<_> = terms.into_iter().map(rewrite_pure).collect(); + let new_instrs = super::terms::terms_to_instructions(&rewritten).ok()?; + if net_stack_effect(window) != net_stack_effect(&new_instrs) { + return None; + } + Some(new_instrs) + } + + /// #219 STEP 1 — WINDOWED SROA. Apply the structural rewriter (`rewrite_pure`) + /// to every maximal pure straight-line window in `instrs`, recursing into + /// nested bodies first. This is the SROA path that survives `br_table` + /// functions like `z_impl`: it NEVER round-trips a control-flow instruction + /// through the (lossy) term IR — only self-contained pure runs, which are + /// faithful. `rewrite_pure` is the proven structural rule set, so each window + /// rewrite is semantics-preserving; the net-stack-effect guard in + /// `rewrite_pure_window` keeps the surrounding stack shape intact. + pub(crate) fn simplify_pure_windows(instrs: &mut Vec) { + // Recurse into nested bodies first. + for instr in instrs.iter_mut() { + match instr { + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + simplify_pure_windows(body); + } + Instruction::If { + then_body, + else_body, + .. + } => { + simplify_pure_windows(then_body); + simplify_pure_windows(else_body); + } + _ => {} + } + } + + // Rewrite maximal pure windows in THIS body. + let mut out: Vec = Vec::with_capacity(instrs.len()); + let mut i = 0; + while i < instrs.len() { + if is_simple_pure_instr(&instrs[i]) { + let start = i; + while i < instrs.len() && is_simple_pure_instr(&instrs[i]) { + i += 1; + } + let window = &instrs[start..i]; + match rewrite_pure_window(window) { + Some(rewritten) => out.extend(rewritten), + None => out.extend_from_slice(window), + } + } else { + out.push(instrs[i].clone()); + i += 1; + } + } + *instrs = out; + } + + /// #219 STEP 2 — locate the single def of `carrier` and extract its pure + /// defining expression (the RHS), recursing into nested bodies. + /// + /// Returns `(rhs_instrs, is_tee)` where `rhs_instrs` is the maximal pure, + /// stack-closed, single-value (`net == +1`) instruction run immediately + /// preceding the def — i.e. exactly the expression the `local.set`/`local.tee` + /// stores. Returns `None` if the def isn't found or its RHS isn't a pure, + /// self-contained single value. The caller has already established (via + /// `carrier_def_dominates_uses`) that `carrier` is single-assignment and its + /// def is not under an `If`/`Loop`, so there is exactly one def to find. + fn extract_carrier_rhs( + instrs: &[Instruction], + carrier: u32, + ) -> Option<(Vec, bool)> { + for (d, instr) in instrs.iter().enumerate() { + match instr { + Instruction::LocalSet(i) | Instruction::LocalTee(i) if *i == carrier => { + let is_tee = matches!(instr, Instruction::LocalTee(_)); + // Maximal pure run ending at the def. + let mut run_start = d; + while run_start > 0 && is_simple_pure_instr(&instrs[run_start - 1]) { + run_start -= 1; + } + // Longest suffix [s..d] that is a single self-contained value + // (round-trips through the term IR as exactly one term). + for s in run_start..d { + let cand = &instrs[s..d]; + if let Ok(terms) = super::terms::instructions_to_terms(cand) { + if terms.len() == 1 && net_stack_effect(cand) == 1 { + return Some((cand.to_vec(), is_tee)); + } + } + } + return None; + } + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + if let Some(r) = extract_carrier_rhs(body, carrier) { + return Some(r); + } + } + Instruction::If { + then_body, + else_body, + .. + } => { + if let Some(r) = extract_carrier_rhs(then_body, carrier) + .or_else(|| extract_carrier_rhs(else_body, carrier)) + { + return Some(r); + } + } + _ => {} + } + } + None + } + + /// #219 STEP 2 — forward `carrier`'s pure RHS to every use and drop the def, + /// recursing into nested bodies. + /// + /// - Each `local.get carrier` is replaced by a clone of `rhs` (pure ⇒ + /// re-evaluating at the use is sound). + /// - The def is removed: a `local.tee` is dropped but its RHS run is LEFT in + /// place (its value stays on the stack for the original consumer); a + /// `local.set` is dropped together with its preceding RHS run (now-dead pure + /// computation — sound to delete). + /// + /// The carrier local becomes dead afterwards; later DCE/dead-local passes + /// remove its declaration. The exposed pack/unpack is dissolved by a + /// subsequent `simplify_pure_windows`. + fn forward_carrier_in_body( + body: &mut Vec, + carrier: u32, + rhs: &[Instruction], + is_tee: bool, + ) { + // Recurse into nested bodies first so uses there are substituted. + for instr in body.iter_mut() { + match instr { + Instruction::Block { body: b, .. } | Instruction::Loop { body: b, .. } => { + forward_carrier_in_body(b, carrier, rhs, is_tee); + } + Instruction::If { + then_body, + else_body, + .. + } => { + forward_carrier_in_body(then_body, carrier, rhs, is_tee); + forward_carrier_in_body(else_body, carrier, rhs, is_tee); + } + _ => {} + } + } + + let mut out: Vec = Vec::with_capacity(body.len()); + for instr in body.drain(..) { + match &instr { + Instruction::LocalGet(i) if *i == carrier => { + out.extend_from_slice(rhs); + } + Instruction::LocalSet(i) if *i == carrier && !is_tee => { + // Drop the dead set and its preceding RHS run. + let drop_n = rhs.len().min(out.len()); + out.truncate(out.len() - drop_n); + } + Instruction::LocalTee(i) if *i == carrier && is_tee => { + // Drop the tee; its RHS value stays on the stack for the + // original consumer. + } + _ => out.push(instr), + } + } + *body = out; + } + /// #219 carrier scalar-forwarding (the perf milestone): forward a TOP-LEVEL /// single-assignment local's pure defining expression to its use sites, then /// let the committed SROA rules dissolve the now-exposed pack/unpack. This @@ -9077,18 +9333,21 @@ pub mod optimize { /// dominance: def not under If/Loop, all uses after it, no pre-def branch /// escapes the def's enclosing blocks). The def is then reached on every /// non-trapping path from entry ⇒ dominates every use. - /// - The forwarding mechanism (`rewrite_with_dataflow` + `OptimizationEnv`) - /// additionally invalidates a forwarding when any input local is - /// reassigned (reaching-defs), so a forwarded expression always carries - /// its def-time inputs. - /// - `is_forwardable_expr` admits only pure ops (no calls/loads/stores/ - /// div-rem/floats), so re-evaluating the expression at a use is safe. + /// - The forwarding is done at the INSTRUCTION level + /// (`extract_carrier_rhs` + `forward_carrier_in_body`), NOT through the + /// term IR: the term round-trip is lossy for control flow (it pushes + /// `br_if`/`br_table` as value terms), so any term-based forwarding turns + /// a `br_table` function like `z_impl` stack-invalid (#219). Only the + /// carrier's pure RHS — itself a self-contained straight-line run — is + /// ever taken through terms, and only via `simplify_pure_windows` for the + /// final SROA dissolution, which never round-trips control flow. + /// - `extract_carrier_rhs` admits only a pure (`is_simple_pure_instr`) + /// single-value run, so re-evaluating the expression at a use is safe. /// /// `verify_or_revert` still guards non-void functions and stack correctness. pub fn forward_carrier_locals(module: &mut Module) -> Result<()> { use crate::stack::validation::{ValidationContext, ValidationGuard}; use crate::verify::{TranslationValidator, VerificationSignatureContext}; - use loom_isle::{LocalEnv, rewrite_pure, rewrite_with_dataflow}; use std::collections::HashSet; let ctx = ValidationContext::from_module(module); @@ -9123,32 +9382,30 @@ pub mod optimize { ); let original_instructions = func.instructions.clone(); - let had_end = func.instructions.last() == Some(&Instruction::End); - if let Ok(terms) = super::terms::instructions_to_terms(&func.instructions) { - if !terms.is_empty() { - let mut env = LocalEnv::new(); - env.single_assign = single_assign; - // Forward the carrier (dataflow), then apply the structural - // SROA rules (rewrite_pure) to dissolve the exposed pack/unpack. - let forwarded: Vec<_> = terms - .into_iter() - .map(|t| rewrite_with_dataflow(t, &mut env)) - .map(rewrite_pure) - .collect(); - if let Ok(mut new_instrs) = super::terms::terms_to_instructions(&forwarded) { - if !new_instrs.is_empty() { - if !had_end && new_instrs.last() == Some(&Instruction::End) { - new_instrs.pop(); - } - func.instructions = new_instrs; - } - } + // Instruction-level forwarding (br_table-safe): for each forwardable + // carrier, extract its pure single-value RHS, substitute it into every + // use, and drop the dead def. Process carriers deterministically. + let mut carriers: Vec = single_assign.into_iter().collect(); + carriers.sort_unstable(); + for carrier in carriers { + if let Some((rhs, is_tee)) = extract_carrier_rhs(&func.instructions, carrier) { + forward_carrier_in_body(&mut func.instructions, carrier, &rhs, is_tee); + } + } + // Dissolve the exposed pack/unpack with windowed SROA (pure windows + // only — never round-trips control flow). + simplify_pure_windows(&mut func.instructions); + + // Revert unless: stack/verify pass AND the function did not grow + // (never pessimize — forwarding that fails to dissolve is dropped). + let grew = count_instrs(&func.instructions) > count_instrs(&original_instructions); + if grew || guard.validate(func).is_err() || translator.verify(func).is_err() { + if grew { + crate::stats::record_revert("forward_carrier_locals_no_shrink"); + } else { + eprintln!("forward_carrier_locals: reverting function (verification rejected)"); + crate::stats::record_revert("forward_carrier_locals"); } - } - - if guard.validate(func).is_err() || translator.verify(func).is_err() { - eprintln!("forward_carrier_locals: reverting function (verification rejected)"); - crate::stats::record_revert("forward_carrier_locals"); func.instructions = original_instructions; } } @@ -14910,6 +15167,58 @@ mod tests { ); } + // #219 STEP 1: windowed SROA — rewrite_pure applied to maximal pure + // straight-line windows, br_table-safe (never round-trips control flow). + #[test] + fn test_windowed_sroa() { + use crate::optimize::simplify_pure_windows; + + // A pure window dissolves: wrap_i64(extend_i32_u(x)) = x (Z3-validated + // seam-SROA rule). The 3-instruction window collapses to a single get. + let mut w = vec![ + Instruction::LocalGet(0), + Instruction::I64ExtendI32U, + Instruction::I32WrapI64, + ]; + simplify_pure_windows(&mut w); + assert_eq!( + w, + vec![Instruction::LocalGet(0)], + "pure window wrap(extend_u(x)) must dissolve to x" + ); + + // A window ADJACENT to a br_table is rewritten on its pure parts only; + // the br_table itself (and block structure) is preserved untouched — + // the lossy term round-trip is never invoked on control flow. + let inner = vec![ + // pure prefix: 2 + 3 → 5 (const-fold), then the index for br_table + Instruction::LocalGet(0), + Instruction::I64ExtendI32U, + Instruction::I32WrapI64, // dissolves to local.get 0 + Instruction::BrTable { + targets: vec![0, 0], + default: 0, + }, + ]; + let mut body = vec![Instruction::Block { + block_type: BlockType::Empty, + body: inner, + }]; + simplify_pure_windows(&mut body); + // The Block + BrTable survive; the pure prefix inside is simplified. + match &body[0] { + Instruction::Block { body: b, .. } => { + assert_eq!(b[0], Instruction::LocalGet(0), "pure prefix simplified"); + assert!( + matches!(b.last(), Some(Instruction::BrTable { .. })), + "br_table preserved untouched, not round-tripped" + ); + assert_eq!(b.len(), 2, "window collapsed to [local.get 0, br_table]"); + } + other => panic!("expected Block, got {other:?}"), + } + } + #[test] fn test_parse_wat_simple() { let wat = r#" @@ -18285,28 +18594,24 @@ mod tests { #[test] fn test_seam_sroa_shr_extracts_high_field() { // #219 seam-SROA: (shr_u (or (shl (extend_u x) 32) const) 32) extracts the - // HIGH field of a u64 pack → (extend_u x) & 0xffffffff. The low (const) - // field shifts out (logical shift distributes over OR; shl-then-shr-same - // masks to the low 64-k bits). Mirrors the sem decide whose low field is - // a constant 0/1. + // HIGH field of a u64 pack → (extend_u x). The low (const) field shifts + // out (logical shift distributes over OR; shl-then-shr-same masks to the + // low 64-k bits, leaving (extend_u x) & 0xffffffff, which the extend-mask + // rule collapses to (extend_u x) since a zero-extend already fits in 32 + // bits). Mirrors the sem decide whose low field is a constant 0/1. use loom_isle::{ - Imm64, i64_extend_i32_u, iand64, iconst64, ior64, ishl64, ishru64, local_get, - rewrite_pure, + Imm64, i64_extend_i32_u, iconst64, ior64, ishl64, ishru64, local_get, rewrite_pure, }; let high = ishl64(i64_extend_i32_u(local_get(0)), iconst64(Imm64(32))); let pack = ior64(high, iconst64(Imm64(1))); // low field = const (like local 3 = 0/1) let unpacked = ishru64(pack, iconst64(Imm64(32))); let simplified = rewrite_pure(unpacked); - let want = terms::terms_to_instructions(&[iand64( - i64_extend_i32_u(local_get(0)), - iconst64(Imm64(0xffff_ffff)), - )]) - .unwrap(); + let want = terms::terms_to_instructions(&[i64_extend_i32_u(local_get(0))]).unwrap(); let got = terms::terms_to_instructions(&[simplified]).unwrap(); assert_eq!( got, want, - "#219: (shr_u (or (shl (extend_u x) 32) const) 32) must extract (extend_u x) & 0xffffffff" + "#219: (shr_u (or (shl (extend_u x) 32) const) 32) must extract (extend_u x)" ); } diff --git a/loom-shared/src/lib.rs b/loom-shared/src/lib.rs index ee26a9b..2fa7271 100644 --- a/loom-shared/src/lib.rs +++ b/loom-shared/src/lib.rs @@ -3992,6 +3992,21 @@ fn rewrite_pure_impl(val: Value) -> Value { { iconst64(Imm64(0)) } + // #219 seam-SROA: extend_i32_u(x) & M → extend_i32_u(x) when + // M's low 32 bits are all set. Zero-extending an i32 yields a + // value in [0, 2^32), so a mask covering bits [0,32) preserves + // it (bits [32,64) of the extend are already 0). Z3: (zext32 x) + // & M == (zext32 x) when (M & 0xffffffff) == 0xffffffff. + (ValueData::I64ExtendI32U { .. }, ValueData::I64Const { val: m }) + if (m.value() as u64) & 0xffff_ffff == 0xffff_ffff => + { + lhs_simplified + } + (ValueData::I64Const { val: m }, ValueData::I64ExtendI32U { .. }) + if (m.value() as u64) & 0xffff_ffff == 0xffff_ffff => + { + rhs_simplified + } // #219 seam-SROA: (or A B) & M → (survivor & M) when one OR // operand is a left shift the mask clears. Recurse so the // survivor (and a both-shifted case) simplifies further. From 6299d760450fda984a2873ae6b4ae3f94e10bd6d Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Fri, 19 Jun 2026 00:13:42 +0200 Subject: [PATCH 19/20] feat(219): narrow-local analysis completes the seam dissolution The forwarded unpack (extend(a)<<32 | status) >> 32 distributes (in rewrite_pure) to extend(a) | (status>>32) but can't finish on its own: status>>32 -> 0 needs the value-range fact status < 2^32, which context-free SROA can't see. Add a narrow-local analysis (loom-core mod optimize): - def_value_is_narrow / narrow_locals: a local is narrow (< 2^32, high 32 bits always 0) when EVERY def stores i64.extend_i32_u(_), an i64.const with the high 32 bits clear, or (_ & i64.const c) with c < 2^32. - fold_narrow_high_shr: peephole (shr_u (local.get N) k) -> i64.const 0 when N is narrow and (k mod 64) >= 32. - forward_carrier_locals iterates fold+windows to a fixpoint: simplify_pure_windows distributes the unpack and EXPOSES the bare status>>32, the next fold zeroes it, the next window collapses (or extend 0) -> extend. z_impl now FULLY dissolves: the store site becomes local.get 0; local.get a; i64.extend_i32_u; i64.store32 offset=8 storing the thread pointer directly. shl / tee / or / shr_u and the redundant mask are all gone. 416 lib tests green, e2e validates. Refs: #219 Co-Authored-By: Claude Opus 4.8 --- loom-core/src/lib.rs | 186 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 183 insertions(+), 3 deletions(-) diff --git a/loom-core/src/lib.rs b/loom-core/src/lib.rs index aa8b58f..510b354 100644 --- a/loom-core/src/lib.rs +++ b/loom-core/src/lib.rs @@ -9203,6 +9203,119 @@ pub mod optimize { *instrs = out; } + /// #219 STEP 3 — is the value stored by the def at `instrs[def_idx]` + /// provably `< 2^32` (high 32 bits always zero)? + /// + /// Conservative, purely local: looks only at the instruction(s) immediately + /// preceding the `local.set`/`local.tee`. A stored value is narrow when it is + /// `i64.extend_i32_u(_)` (a zero-extend always fits in 32 bits), an + /// `i64.const c` with the high 32 bits clear, or `(_ & i64.const c)` with `c` + /// fitting in 32 bits (the mask clears bits ≥ 32). Anything else → not narrow. + fn def_value_is_narrow(instrs: &[Instruction], def_idx: usize) -> bool { + if def_idx == 0 { + return false; + } + match &instrs[def_idx - 1] { + Instruction::I64ExtendI32U => true, + Instruction::I64Const(c) => (*c as u64) >> 32 == 0, + // `; i64.const c; i64.and` — the AND's RHS const (pushed last) + // sits two before the set. A mask < 2^32 bounds the result. + Instruction::I64And => { + def_idx >= 2 + && matches!( + &instrs[def_idx - 2], + Instruction::I64Const(c) if (*c as u64) >> 32 == 0 + ) + } + _ => false, + } + } + + /// #219 STEP 3 — the set of locals provably `< 2^32` (high 32 bits always + /// zero): every write of the local stores a narrow value + /// (`def_value_is_narrow`), recursing into nested bodies. A local with no + /// write, or any non-narrow write, is excluded (conservative). + pub(crate) fn narrow_locals(instrs: &[Instruction]) -> std::collections::HashSet { + use std::collections::HashSet; + fn scan(instrs: &[Instruction], written: &mut HashSet, not_narrow: &mut HashSet) { + for (i, instr) in instrs.iter().enumerate() { + match instr { + Instruction::LocalSet(n) | Instruction::LocalTee(n) => { + written.insert(*n); + if !def_value_is_narrow(instrs, i) { + not_narrow.insert(*n); + } + } + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + scan(body, written, not_narrow); + } + Instruction::If { + then_body, + else_body, + .. + } => { + scan(then_body, written, not_narrow); + scan(else_body, written, not_narrow); + } + _ => {} + } + } + } + let mut written = HashSet::new(); + let mut not_narrow = HashSet::new(); + scan(instrs, &mut written, &mut not_narrow); + written.difference(¬_narrow).copied().collect() + } + + /// #219 STEP 3 — fold `(i64.shr_u (local.get N) k)` → `i64.const 0` when `N` + /// is narrow (`< 2^32`) and the effective shift `k mod 64 ≥ 32`, recursing + /// into nested bodies. + /// + /// Shifting a value whose high 32 bits are zero right by ≥ 32 yields 0. This + /// is the value-range fact pure structural SROA can't see; supplying it here + /// lets the subsequent `simplify_pure_windows` collapse `(or P 0) → P`, + /// completing the seam unpack `(extend(a)<<32 | status) >> 32 → extend(a)`. + pub(crate) fn fold_narrow_high_shr( + instrs: &mut Vec, + narrow: &std::collections::HashSet, + ) { + for instr in instrs.iter_mut() { + match instr { + Instruction::Block { body, .. } | Instruction::Loop { body, .. } => { + fold_narrow_high_shr(body, narrow); + } + Instruction::If { + then_body, + else_body, + .. + } => { + fold_narrow_high_shr(then_body, narrow); + fold_narrow_high_shr(else_body, narrow); + } + _ => {} + } + } + + let mut out: Vec = Vec::with_capacity(instrs.len()); + let mut i = 0; + while i < instrs.len() { + if i + 2 < instrs.len() { + if let (Instruction::LocalGet(n), Instruction::I64Const(k), Instruction::I64ShrU) = + (&instrs[i], &instrs[i + 1], &instrs[i + 2]) + { + if narrow.contains(n) && (*k as u64 & 63) >= 32 { + out.push(Instruction::I64Const(0)); + i += 3; + continue; + } + } + } + out.push(instrs[i].clone()); + i += 1; + } + *instrs = out; + } + /// #219 STEP 2 — locate the single def of `carrier` and extract its pure /// defining expression (the RHS), recursing into nested bodies. /// @@ -9392,9 +9505,24 @@ pub mod optimize { forward_carrier_in_body(&mut func.instructions, carrier, &rhs, is_tee); } } - // Dissolve the exposed pack/unpack with windowed SROA (pure windows - // only — never round-trips control flow). - simplify_pure_windows(&mut func.instructions); + // Dissolve the exposed pack/unpack. Iterate two passes to a fixpoint: + // - simplify_pure_windows runs rewrite_pure on pure windows, which + // DISTRIBUTES the unpack: (or P Q)>>32 → (P>>32) | (Q>>32), + // EXPOSING a bare `status>>32`; + // - fold_narrow_high_shr then supplies the value-range fact pure + // SROA can't see — (shr_u (local.get N) k≥32) → 0 for narrow + // (< 2^32) locals — letting the next window collapse (or P 0)→P. + // The bare shr_u only appears AFTER the first window pass distributes, + // so neither order alone suffices; iterate until stable (capped). + for _ in 0..4 { + let before = func.instructions.clone(); + let narrow = narrow_locals(&func.instructions); + fold_narrow_high_shr(&mut func.instructions, &narrow); + simplify_pure_windows(&mut func.instructions); + if func.instructions == before { + break; + } + } // Revert unless: stack/verify pass AND the function did not grow // (never pessimize — forwarding that fails to dissolve is dropped). @@ -15219,6 +15347,58 @@ mod tests { } } + // #219 STEP 3: narrow-local analysis + high-shift fold. + #[test] + fn test_narrow_locals_and_high_shr_fold() { + use crate::optimize::{fold_narrow_high_shr, narrow_locals}; + + // local 0: every def narrow (extend_u, then small const) → narrow. + // local 1: a def with the high 32 bits set (0x1_0000_0000) → NOT narrow. + // local 2: def via (_ & 0xff) const mask → narrow. + let instrs = vec![ + Instruction::LocalGet(9), + Instruction::I64ExtendI32U, + Instruction::LocalSet(0), + Instruction::I64Const(1), + Instruction::LocalSet(0), // local 0: both defs narrow + Instruction::I64Const(0x1_0000_0000), + Instruction::LocalSet(1), // local 1: wide const → not narrow + Instruction::LocalGet(9), + Instruction::I64ExtendI32U, + Instruction::I64Const(0xff), + Instruction::I64And, + Instruction::LocalSet(2), // local 2: masked by 0xff → narrow + ]; + let narrow = narrow_locals(&instrs); + assert!( + narrow.contains(&0), + "local 0 (extend + small const) is narrow" + ); + assert!(!narrow.contains(&1), "local 1 (wide const) is NOT narrow"); + assert!(narrow.contains(&2), "local 2 (& 0xff) is narrow"); + + // Fold: narrow local's >>32 → 0; non-narrow local's >>32 untouched. + let mut body = vec![ + Instruction::LocalGet(0), + Instruction::I64Const(32), + Instruction::I64ShrU, // narrow → folds to const 0 + Instruction::LocalGet(1), + Instruction::I64Const(32), + Instruction::I64ShrU, // not narrow → preserved + ]; + fold_narrow_high_shr(&mut body, &narrow); + assert_eq!( + body, + vec![ + Instruction::I64Const(0), + Instruction::LocalGet(1), + Instruction::I64Const(32), + Instruction::I64ShrU, + ], + "narrow >>32 folds to 0; non-narrow >>32 preserved" + ); + } + #[test] fn test_parse_wat_simple() { let wat = r#" From 1d284de698e35ab0b8de950f7b7b9f6fcdf9267d Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Wed, 1 Jul 2026 23:09:36 +0200 Subject: [PATCH 20/20] test(219): update test_rse_different_locals for merged pipeline full-fold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The origin/main merge combines #219's optimize_module phases (forward_carrier_locals) with main's stronger simplify_locals. Together they now fold `local x=30; local y=20; return x+y` all the way to `[I32Const(50)]` (both locals eliminated, add constant-folded) instead of leaving one surviving `$y` store. The old assertion expected exactly 1 store; the new fully-folded form is strictly more aggressive and provably equivalent (30 + 20 == 50). The behavioral differential gate certifies semantics PRESERVED on this module and on the whole CI corpus (httparse/json_lite/state_machine/ sem.loom: 4 certified, 0 diverged). Not a miscompile — a stale, over- specified intermediate-form expectation invalidated by the merge. Refs #219 Refs #238 Co-Authored-By: Claude Opus 4.8 --- loom-core/tests/optimization_tests.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/loom-core/tests/optimization_tests.rs b/loom-core/tests/optimization_tests.rs index f564aec..8706c43 100644 --- a/loom-core/tests/optimization_tests.rs +++ b/loom-core/tests/optimization_tests.rs @@ -1155,13 +1155,19 @@ fn test_rse_different_locals() { let wasm_bytes = encode::encode_wasm(&module).expect("Failed to encode"); wasmparser::validate(&wasm_bytes).expect("Generated WASM is invalid"); - // Only $y store remains (local $x is completely eliminated via optimization cascade) + // After the #219 merge the library pipeline (optimize_module now runs + // forward_carrier_locals + main's stronger simplify_locals) folds this whole + // function to its constant result: x=30, y=20, x+y == 50. Both locals are + // eliminated and the add is constant-folded, leaving `[I32Const(50)]` — 0 + // stores. This is strictly more aggressive than the old "1 store ($y) + // survives" expectation and is provably equivalent (30 + 20 == 50); the + // behavioral differential gate certifies semantics PRESERVED on this module. let instructions_str = format!("{:?}", module.functions[0].instructions); let store_count = instructions_str.matches("LocalSet").count() + instructions_str.matches("LocalTee").count(); assert_eq!( - store_count, 1, - "Expected 1 store ($y only - $x eliminated), got {} in: {:?}", + store_count, 0, + "Expected 0 stores (both locals eliminated, x+y folded to const 50), got {} in: {:?}", store_count, module.functions[0].instructions ); }