diff --git a/crates/synth-backend-riscv/src/selector.rs b/crates/synth-backend-riscv/src/selector.rs index 4a87e21..7eb327c 100644 --- a/crates/synth-backend-riscv/src/selector.rs +++ b/crates/synth-backend-riscv/src/selector.rs @@ -4,9 +4,10 @@ //! with trap-on-zero), loads/stores against linear memory, simple control //! flow (block / loop / if / br / br_if), and local variable access. //! +//! i64 divide / remainder is lowered to an inline software long-division +//! loop (RV32 has no 64-bit divide instruction) — see `lower_i64_div`. +//! //! Out of scope (see `select_simple` doc comments for the full list): -//! - i64 divide / remainder (Phase 3 — needs a `__divdi3`-style software -//! long-division routine; RV32 has no 64-bit divide instruction) //! - sign-extending sub-word i64 loads (`i64.load8_s` etc.) //! - F32/F64 (RV32F/D — not yet wired) //! - br_table (lowered in B3 alongside jump tables) @@ -454,14 +455,15 @@ impl Selector { I64Extend16S => self.lower_i64_extend_sub(op, 16)?, I64Extend32S => self.lower_i64_extend_sub(op, 32)?, - // i64 division / remainder is deferred to Phase 3 — RV32 has no - // 64-bit divide instruction, so these need a `__divdi3`-style - // software long-division routine (multi-block loop). Implementing - // it inline would balloon the selector; emitting it as a runtime - // helper needs a call-out the selector doesn't yet support. - // Until then they fall through to the `Unsupported` arm below — - // fail loudly rather than silently miscompile. - // I64DivS / I64DivU / I64RemS / I64RemU — Phase 3. + // ─── i64 division / remainder (Phase 3) ───────────────────── + // RV32's M extension only has 32-bit div/rem, so a 64-bit divide + // is lowered to an inline software long-division loop. See + // `emit_i64_udiv_inline` for the unsigned core; the signed ops + // wrap it with magnitude/sign handling. + I64DivS => self.lower_i64_div(op, /*signed=*/ true, /*want_rem=*/ false)?, + I64DivU => self.lower_i64_div(op, /*signed=*/ false, /*want_rem=*/ false)?, + I64RemS => self.lower_i64_div(op, /*signed=*/ true, /*want_rem=*/ true)?, + I64RemU => self.lower_i64_div(op, /*signed=*/ false, /*want_rem=*/ true)?, // ─── i64 memory ───────────────────────────────────────────── I64Load { offset, align: _ } => self.lower_i64_load(op, *offset)?, @@ -2548,6 +2550,530 @@ impl Selector { self.push_i64(lo, hi); Ok(()) } + + // ────────── i64 division / remainder (Phase 3) ────────── + // + // RV32IMAC's M extension only provides 32-bit `div`/`divu`/`rem`/`remu`. + // A 64-bit divide has no single instruction, so synth lowers it to an + // inline software long-division loop — the moral equivalent of + // compiler-rt's `__udivdi3` / `__divdi3` / `__umoddi3` / `__moddi3`. + // + // We pick the *inline* approach (rather than emitting a `Call` to a + // runtime helper) because synth produces self-contained bare-metal ELF + // and has no runtime-library contract; an inline loop keeps the output + // dependency-free at a modest code-size cost (~30 instructions per op). + // + // The unsigned core is `emit_i64_udiv_inline`; `lower_i64_div` is the + // single entry point for all four ops, dispatching on `signed`/`want_rem` + // and wrapping the core with sign handling for the signed variants. + + /// Fixed register file used by the inline long-division core. RV32's + /// `temps` pool is recycled round-robin with no liveness tracking, which + /// is fine for straight-line code but unsafe across a loop body where + /// many values stay live for all 64 iterations. So the long-division + /// loop instead claims a fixed, non-overlapping set of registers for its + /// exclusive use: it copies its inputs in at entry and hands its outputs + /// back via dedicated result registers, after which the caller is free + /// to move them into `alloc_temp` temporaries. + /// + /// The seven state registers (`q_lo, q_hi, r_lo, r_hi, d_lo, d_hi, cnt`) + /// plus three loop-body scratch registers are all distinct and avoid the + /// reserved `s0` (fp) / `s11` (linear-memory base). + const DIV_Q_LO: Reg = Reg::T0; + const DIV_Q_HI: Reg = Reg::T1; + const DIV_R_LO: Reg = Reg::T2; + const DIV_R_HI: Reg = Reg::T3; + const DIV_D_LO: Reg = Reg::T4; + const DIV_D_HI: Reg = Reg::T5; + const DIV_CNT: Reg = Reg::T6; + const DIV_SCRATCH0: Reg = Reg::S1; + const DIV_SCRATCH1: Reg = Reg::S2; + const DIV_SCRATCH2: Reg = Reg::S3; + /// Cycle-breaking scratch for the input parallel-move. Must be outside + /// the `temps` pool (a div input could otherwise alias it) — `s7` is + /// callee-saved and never handed out by `alloc_temp`. + const DIV_MOVE_SCRATCH: Reg = Reg::S7; + + /// Emit a set of parallel register moves `dst <- src` such that every + /// source is read before any move overwrites it — i.e. the moves behave + /// as if performed simultaneously. `scratch` is a free register used to + /// break dependency cycles (e.g. the swap `a<-b, b<-a`); the caller must + /// guarantee `scratch` is neither a source nor a destination. + /// + /// Standard parallel-copy algorithm: + /// - repeatedly emit any move whose destination is not still needed as + /// a source by a pending move (safe, no clobber); + /// - when only cycles remain, save one node's value to `scratch`, + /// rewrite the move that sourced it to read `scratch` instead, and + /// continue. A permutation of N registers needs at most N+1 moves. + /// + /// `mv` is `addi rd, rs, 0`. Trivial `dst == src` moves are skipped. + fn emit_parallel_move(&mut self, moves: &[(Reg, Reg)], scratch: Reg) { + // Working list of (dst, src) pairs still to perform. + let mut pending: Vec<(Reg, Reg)> = moves.iter().copied().filter(|(d, s)| d != s).collect(); + while !pending.is_empty() { + // Find a move whose dst is not the src of any *other* pending + // move — emitting it cannot clobber a value still needed. + let ready = pending + .iter() + .position(|&(d, _)| !pending.iter().any(|&(_, s)| s == d)); + match ready { + Some(idx) => { + let (d, s) = pending.remove(idx); + self.out.push(RiscVOp::Addi { + rd: d, + rs1: s, + imm: 0, + }); + } + None => { + // Only cycles remain. Break one: stash the first move's + // source into `scratch`, then redirect whichever pending + // move read that source to read `scratch` instead. + let (_, cyc_src) = pending[0]; + self.out.push(RiscVOp::Addi { + rd: scratch, + rs1: cyc_src, + imm: 0, + }); + for m in pending.iter_mut() { + if m.1 == cyc_src { + m.1 = scratch; + } + } + } + } + } + } + + /// Emit the unsigned 64-bit long-division core. + /// + /// Inputs `num = (num_lo, num_hi)` and `den = (den_lo, den_hi)` are + /// **unsigned** 64-bit values. The caller must already have guaranteed a + /// non-zero divisor (see `lower_i64_div`, which emits the trap). On + /// return, the quotient lives in `(DIV_Q_LO, DIV_Q_HI)` and the + /// remainder in `(DIV_R_LO, DIV_R_HI)`. + /// + /// Algorithm — classic restoring binary long division, numerator held in + /// the quotient register (the "shift the dividend out, shift the quotient + /// in" trick), 64 loop iterations: + /// ```text + /// q = num ; r = 0 ; cnt = 64 + /// loop: + /// r = (r << 1) | (q >> 63) ; shift the MSB of q into r's LSB + /// q = q << 1 ; q's LSB is now free for a quotient bit + /// if r >=u den: ; does the divisor fit? + /// r = r - den + /// q = q | 1 ; record a 1 quotient bit + /// cnt = cnt - 1 + /// if cnt != 0: goto loop + /// ``` + /// All of `<< 1`, the 64-bit unsigned compare, and the 64-bit subtract + /// are open-coded over the lo/hi register pair. + fn emit_i64_udiv_inline(&mut self, num: I64Pair, den: I64Pair) { + let (q_lo, q_hi) = (Self::DIV_Q_LO, Self::DIV_Q_HI); + let (r_lo, r_hi) = (Self::DIV_R_LO, Self::DIV_R_HI); + let (d_lo, d_hi) = (Self::DIV_D_LO, Self::DIV_D_HI); + let cnt = Self::DIV_CNT; + let (s0, s1, s2) = (Self::DIV_SCRATCH0, Self::DIV_SCRATCH1, Self::DIV_SCRATCH2); + + // ── set-up: copy inputs into the fixed state registers ─────────── + // q = num, d = den. The four inputs came from `alloc_temp` and may + // alias the fixed `DIV_*` state registers in any permutation, so a + // naive sequence of copies could clobber a source before it's read. + // `emit_parallel_move` schedules the four `mv`s so every source is + // read before it's overwritten (cycle-breaking via one scratch). + self.emit_parallel_move( + &[(q_lo, num.0), (q_hi, num.1), (d_lo, den.0), (d_hi, den.1)], + Self::DIV_MOVE_SCRATCH, + ); + // r = 0 (the running remainder accumulator). + self.out.push(RiscVOp::Addi { + rd: r_lo, + rs1: Reg::ZERO, + imm: 0, + }); + self.out.push(RiscVOp::Addi { + rd: r_hi, + rs1: Reg::ZERO, + imm: 0, + }); + // cnt = 64 (one loop pass per dividend bit). + self.out.push(RiscVOp::Addi { + rd: cnt, + rs1: Reg::ZERO, + imm: 64, + }); + + let loop_label = self.fresh_label("Ludiv_loop"); + let no_sub = self.fresh_label("Ludiv_nosub"); + self.out.push(RiscVOp::Label { + name: loop_label.clone(), + }); + + // ── r = (r << 1) | (q >> 63) ───────────────────────────────────── + // The MSB of the 64-bit quotient register becomes the new LSB of the + // remainder. `q >> 63` is just bit 31 of q_hi. + // s0 = q_hi >> 31 (the bit crossing from q into r). + self.out.push(RiscVOp::Srli { + rd: s0, + rs1: q_hi, + shamt: 31, + }); + // r_hi = (r_hi << 1) | (r_lo >> 31) — carry r_lo's MSB up into r_hi. + self.out.push(RiscVOp::Srli { + rd: s1, + rs1: r_lo, + shamt: 31, + }); + self.out.push(RiscVOp::Slli { + rd: r_hi, + rs1: r_hi, + shamt: 1, + }); + self.out.push(RiscVOp::Or { + rd: r_hi, + rs1: r_hi, + rs2: s1, + }); + // r_lo = (r_lo << 1) | s0 — shift r_lo up and drop in the bit from q. + self.out.push(RiscVOp::Slli { + rd: r_lo, + rs1: r_lo, + shamt: 1, + }); + self.out.push(RiscVOp::Or { + rd: r_lo, + rs1: r_lo, + rs2: s0, + }); + + // ── q = q << 1 ─────────────────────────────────────────────────── + // Frees q's LSB to receive a quotient bit below. + self.out.push(RiscVOp::Srli { + rd: s1, + rs1: q_lo, + shamt: 31, + }); + self.out.push(RiscVOp::Slli { + rd: q_hi, + rs1: q_hi, + shamt: 1, + }); + self.out.push(RiscVOp::Or { + rd: q_hi, + rs1: q_hi, + rs2: s1, + }); + self.out.push(RiscVOp::Slli { + rd: q_lo, + rs1: q_lo, + shamt: 1, + }); + + // ── if r >=u den: r -= den ; q |= 1 ────────────────────────────── + // 64-bit unsigned compare `r = den: 64-bit subtract r -= den with borrow. + // borrow = (r_lo Result<(), SelectorError> { + let ((nl, nh), (dl, dh)) = self.pop_pair_i64(op)?; + + // ── Trap 1: divide/remainder by zero ───────────────────────────── + // The divisor is zero iff *both* halves are zero. `or` them and + // branch when the result is non-zero. + let zero_or = self.alloc_temp(); + self.out.push(RiscVOp::Or { + rd: zero_or, + rs1: dl, + rs2: dh, + }); + let div_ok = self.fresh_label("Li64div_ok"); + self.out.push(RiscVOp::Branch { + cond: Branch::Ne, + rs1: zero_or, + rs2: Reg::ZERO, + label: div_ok.clone(), + }); + self.out.push(RiscVOp::Ebreak); + self.out.push(RiscVOp::Label { name: div_ok }); + + // ── Trap 2: signed INT64_MIN / -1 overflow (div_s only) ────────── + // INT64_MIN = 0x8000_0000_0000_0000 → lo == 0, hi == 0x8000_0000. + // -1 (64-bit) → lo == -1, hi == -1. `rem_s` does NOT trap here + // (INT64_MIN % -1 == 0), so this guard is div_s-exclusive. + if signed && !want_rem && self.options.signed_div_overflow_trap { + let sdiv_ok = self.fresh_label("Li64sdiv_ok"); + // dividend != INT64_MIN ? skip. Test the hi half against + // 0x8000_0000 and the lo half against 0. + let tmin_hi = self.alloc_temp(); + emit_load_imm(&mut self.out, tmin_hi, i32::MIN); + self.out.push(RiscVOp::Branch { + cond: Branch::Ne, + rs1: nh, + rs2: tmin_hi, + label: sdiv_ok.clone(), + }); + self.out.push(RiscVOp::Branch { + cond: Branch::Ne, + rs1: nl, + rs2: Reg::ZERO, + label: sdiv_ok.clone(), + }); + // divisor != -1 ? skip. -1 has both halves all-ones. + let neg1 = self.alloc_temp(); + self.out.push(RiscVOp::Addi { + rd: neg1, + rs1: Reg::ZERO, + imm: -1, + }); + self.out.push(RiscVOp::Branch { + cond: Branch::Ne, + rs1: dl, + rs2: neg1, + label: sdiv_ok.clone(), + }); + self.out.push(RiscVOp::Branch { + cond: Branch::Ne, + rs1: dh, + rs2: neg1, + label: sdiv_ok.clone(), + }); + // dividend == INT64_MIN and divisor == -1 → overflow, trap. + self.out.push(RiscVOp::Ebreak); + self.out.push(RiscVOp::Label { name: sdiv_ok }); + } + + if !signed { + // ── Unsigned: feed the operands straight to the core. ──────── + self.emit_i64_udiv_inline((nl, nh), (dl, dh)); + let res = self.copy_div_result(want_rem); + self.push_i64(res.0, res.1); + return Ok(()); + } + + // ── Signed: reduce to magnitudes, divide, fix up the sign. ─────── + // The sign of each operand is bit 63 = the MSB of its hi half; + // `sra hi, 31` broadcasts it to an all-ones / all-zeros mask. + let nsign = self.alloc_temp(); + self.out.push(RiscVOp::Srai { + rd: nsign, + rs1: nh, + shamt: 31, + }); + let dsign = self.alloc_temp(); + self.out.push(RiscVOp::Srai { + rd: dsign, + rs1: dh, + shamt: 31, + }); + + // |dividend| and |divisor|. Negating unconditionally then selecting + // would also work, but the operands here came from `alloc_temp`, so + // we just negate into fresh temps and branch-select. + let (nabs_lo, nabs_hi) = self.emit_i64_abs(nl, nh, nsign); + let (dabs_lo, dabs_hi) = self.emit_i64_abs(dl, dh, dsign); + + self.emit_i64_udiv_inline((nabs_lo, nabs_hi), (dabs_lo, dabs_hi)); + let (mag_lo, mag_hi) = self.copy_div_result(want_rem); + + // Result sign: + // - quotient: negative iff the operand signs differ → nsign ^ dsign + // - remainder: matches the dividend's sign → nsign + let result_sign = if want_rem { + nsign + } else { + let xs = self.alloc_temp(); + self.out.push(RiscVOp::Xor { + rd: xs, + rs1: nsign, + rs2: dsign, + }); + xs + }; + + // Conditionally negate the magnitude when result_sign is all-ones. + let (final_lo, final_hi) = self.emit_i64_apply_sign(mag_lo, mag_hi, result_sign); + self.push_i64(final_lo, final_hi); + Ok(()) + } + + /// Copy the long-division core's fixed result registers into fresh + /// `alloc_temp` temporaries — `want_rem` picks the remainder pair, else + /// the quotient pair. Done immediately after the core so the fixed + /// `DIV_*` registers can be reused by any later op. + fn copy_div_result(&mut self, want_rem: bool) -> I64Pair { + let (src_lo, src_hi) = if want_rem { + (Self::DIV_R_LO, Self::DIV_R_HI) + } else { + (Self::DIV_Q_LO, Self::DIV_Q_HI) + }; + let lo = self.alloc_temp(); + let hi = self.alloc_temp(); + self.out.push(RiscVOp::Addi { + rd: lo, + rs1: src_lo, + imm: 0, + }); + self.out.push(RiscVOp::Addi { + rd: hi, + rs1: src_hi, + imm: 0, + }); + (lo, hi) + } + + /// Absolute value of a 64-bit value, given a precomputed sign mask + /// (`sra hi, 31` — all-ones if negative). Returns `(lo, hi)` of `|x|`. + /// + /// Branchless: `abs(x) = (x ^ mask) - mask`. When `mask` is all-ones + /// this is `~x + 1 = -x`; when all-zeros it is `x` unchanged. The `- mask` + /// is a 64-bit subtract (`- (-1) = +1` carries into the hi half). + fn emit_i64_abs(&mut self, lo: Reg, hi: Reg, mask: Reg) -> I64Pair { + let xl = self.alloc_temp(); + let xh = self.alloc_temp(); + self.out.push(RiscVOp::Xor { + rd: xl, + rs1: lo, + rs2: mask, + }); + self.out.push(RiscVOp::Xor { + rd: xh, + rs1: hi, + rs2: mask, + }); + // 64-bit subtract (xl,xh) - (mask,mask), with borrow from lo to hi. + let borrow = self.alloc_temp(); + self.out.push(RiscVOp::Sltu { + rd: borrow, + rs1: xl, + rs2: mask, + }); + let res_lo = self.alloc_temp(); + let res_hi = self.alloc_temp(); + self.out.push(RiscVOp::Sub { + rd: res_lo, + rs1: xl, + rs2: mask, + }); + self.out.push(RiscVOp::Sub { + rd: res_hi, + rs1: xh, + rs2: mask, + }); + self.out.push(RiscVOp::Sub { + rd: res_hi, + rs1: res_hi, + rs2: borrow, + }); + (res_lo, res_hi) + } + + /// Conditionally negate a 64-bit value: returns `-x` when `sign` is + /// all-ones, `x` when `sign` is all-zeros. Same `(x ^ mask) - mask` + /// branchless identity as `emit_i64_abs` — reused here for the result + /// sign fix-up of signed div/rem. + fn emit_i64_apply_sign(&mut self, lo: Reg, hi: Reg, sign: Reg) -> I64Pair { + self.emit_i64_abs(lo, hi, sign) + } } #[derive(Debug, Clone, Copy)] @@ -4178,10 +4704,12 @@ mod tests { // -------- deferred ops fail loudly -------- + // -------- I64DivS / I64DivU / I64RemS / I64RemU (Phase 3) -------- + #[test] - fn i64_div_rem_are_unsupported_phase3() { - // Division/remainder on i64 are deferred to Phase 3 — they must - // surface as `Unsupported`, never silently miscompile. + fn i64_div_rem_no_longer_unsupported() { + // Phase 3 implements all four i64 div/rem ops via inline software + // long division — the `Unsupported` arm must no longer fire for them. for op in [ WasmOp::I64DivS, WasmOp::I64DivU, @@ -4193,20 +4721,232 @@ mod tests { WasmOp::I64Const(10), WasmOp::I64Const(3), op.clone(), + WasmOp::Drop, WasmOp::End, ], 0, ) .map(|_| ()); // RiscVSelection isn't Debug — drop the Ok payload. assert!( - matches!(r, Err(SelectorError::Unsupported(_))), - "{:?} should be Unsupported (Phase 3), got {:?}", + r.is_ok(), + "{:?} should now lower (Phase 3), got {:?}", op, r ); } } + /// Shared assertion: every i64 div/rem op must emit the zero-divisor + /// trap (`bne or, zero, ok ; ebreak ; ok:`) and the long-division loop + /// (a backward `Branch` to a labelled loop head, plus the inner subtract + /// guard). We pin the *shape*, not register allocation. + fn assert_i64_divrem_shape(out: &[RiscVOp]) { + // The long-division core branches/loops, so several Branch + Label + // ops appear. The zero-divisor trap contributes an Ebreak. + assert!( + count(out, |op| matches!(op, RiscVOp::Ebreak)) >= 1, + "i64 div/rem must emit a trap (ebreak) for the zero divisor" + ); + // The long-division loop body shifts via slli/srli — the core uses + // `slli/srli ..., 1` and `srli ..., 31` for the shift-in carries. + assert!( + count(out, |op| matches!(op, RiscVOp::Slli { shamt: 1, .. })) >= 1, + "long-division loop shifts the quotient/remainder by 1" + ); + assert!( + count(out, |op| matches!(op, RiscVOp::Srli { shamt: 31, .. })) >= 1, + "long-division loop extracts the crossing MSB via `>> 31`" + ); + // The loop subtracts the divisor; the core uses Sub for the 64-bit + // remainder subtraction. + assert!( + count(out, |op| matches!(op, RiscVOp::Sub { .. })) >= 1, + "long-division loop subtracts the divisor" + ); + // At least one backward branch closes the loop, plus the labels. + assert!( + count(out, |op| matches!(op, RiscVOp::Label { .. })) >= 3, + "long-division core emits a loop label + inner-branch labels" + ); + } + + #[test] + fn i64_div_u_emits_zero_trap_and_long_division() { + let out = run_i64(&[ + WasmOp::I64Const(100), + WasmOp::I64Const(7), + WasmOp::I64DivU, + WasmOp::Drop, + ]); + assert_i64_divrem_shape(&out); + // The divide-by-zero guard ORs the two divisor halves and branches + // when the OR is non-zero. + assert!( + count(&out, |op| matches!(op, RiscVOp::Or { .. })) >= 1, + "zero-divisor guard ORs the two divisor halves" + ); + } + + #[test] + fn i64_rem_u_emits_zero_trap_and_long_division() { + let out = run_i64(&[ + WasmOp::I64Const(100), + WasmOp::I64Const(7), + WasmOp::I64RemU, + WasmOp::Drop, + ]); + assert_i64_divrem_shape(&out); + } + + #[test] + fn i64_div_s_emits_zero_trap_and_long_division() { + let out = run_i64(&[ + WasmOp::I64Const(-100), + WasmOp::I64Const(7), + WasmOp::I64DivS, + WasmOp::Drop, + ]); + assert_i64_divrem_shape(&out); + // Signed div reduces operands to magnitudes via `sra hi, 31` sign + // masks, then fixes the result sign — at least two Srai-by-31. + assert!( + count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })) >= 2, + "signed div derives operand sign masks via `sra hi, 31`" + ); + } + + #[test] + fn i64_rem_s_emits_zero_trap_and_long_division() { + let out = run_i64(&[ + WasmOp::I64Const(-100), + WasmOp::I64Const(7), + WasmOp::I64RemS, + WasmOp::Drop, + ]); + assert_i64_divrem_shape(&out); + // rem_s also needs the sign masks (remainder takes the dividend sign). + assert!( + count(&out, |op| matches!(op, RiscVOp::Srai { shamt: 31, .. })) >= 2, + "signed rem derives operand sign masks via `sra hi, 31`" + ); + } + + #[test] + fn i64_div_s_emits_signed_overflow_trap() { + // INT64_MIN / -1 overflows: div_s must emit the overflow guard. The + // guard materialises INT64_MIN's high half (0x8000_0000) via + // emit_load_imm and branches against it; with the trap there are at + // least two Ebreak ops (zero-divisor + overflow). + let out = run_i64(&[ + WasmOp::I64Const(i64::MIN), + WasmOp::I64Const(-1), + WasmOp::I64DivS, + WasmOp::Drop, + ]); + assert!( + count(&out, |op| matches!(op, RiscVOp::Ebreak)) >= 2, + "div_s emits both the zero-divisor and the INT64_MIN/-1 traps" + ); + } + + #[test] + fn i64_rem_s_does_not_emit_overflow_trap() { + // INT64_MIN % -1 == 0 and must NOT trap — rem_s emits only the + // zero-divisor trap, so the Ebreak count is exactly one less than + // the equivalent div_s. We compare the two directly. + let rem_out = run_i64(&[ + WasmOp::I64Const(i64::MIN), + WasmOp::I64Const(-1), + WasmOp::I64RemS, + WasmOp::Drop, + ]); + let div_out = run_i64(&[ + WasmOp::I64Const(i64::MIN), + WasmOp::I64Const(-1), + WasmOp::I64DivS, + WasmOp::Drop, + ]); + let rem_ebreaks = count(&rem_out, |op| matches!(op, RiscVOp::Ebreak)); + let div_ebreaks = count(&div_out, |op| matches!(op, RiscVOp::Ebreak)); + assert_eq!( + rem_ebreaks + 1, + div_ebreaks, + "rem_s must omit the INT64_MIN/-1 overflow trap that div_s emits" + ); + } + + #[test] + fn i64_div_result_is_i64_typed() { + // The quotient is pushed as an i64 pair — a following i64 consumer + // (I64Add) must lower without a type mismatch. + let out = run_i64(&[ + WasmOp::I64Const(40), + WasmOp::I64Const(8), + WasmOp::I64DivU, + WasmOp::I64Const(1), + WasmOp::I64Add, + WasmOp::Drop, + ]); + // I64Add emits its add/sltu/add/add quartet → at least 3 Add ops. + assert!(count(&out, |op| matches!(op, RiscVOp::Add { .. })) >= 3); + } + + #[test] + fn parallel_move_breaks_a_swap_cycle() { + // The input copy-in for the long-division core must be alias-safe + // even when the inputs form a permutation cycle. A pure swap + // `t0<-t1, t1<-t0` needs a scratch-mediated 3-move expansion. + let mut ctx = Selector::new_with_options(0, SelectorOptions::wasm_compliant()); + ctx.emit_parallel_move(&[(Reg::T0, Reg::T1), (Reg::T1, Reg::T0)], Reg::S7); + // A 2-cycle expands to exactly 3 moves (save + 2 copies). + let mvs = count(&ctx.out, |op| matches!(op, RiscVOp::Addi { imm: 0, .. })); + assert_eq!(mvs, 3, "a swap cycle expands to 3 scratch-mediated moves"); + // The scratch register must appear as a move destination (the save). + assert!( + ctx.out + .iter() + .any(|op| matches!(op, RiscVOp::Addi { rd: Reg::S7, .. })), + "swap cycle is broken through the scratch register" + ); + } + + #[test] + fn parallel_move_skips_trivial_self_moves() { + // `dst == src` pairs are no-ops and must not be emitted at all. + let mut ctx = Selector::new_with_options(0, SelectorOptions::wasm_compliant()); + ctx.emit_parallel_move(&[(Reg::T0, Reg::T0), (Reg::T1, Reg::T2)], Reg::S7); + let mvs = count(&ctx.out, |op| matches!(op, RiscVOp::Addi { .. })); + assert_eq!(mvs, 1, "only the non-trivial move is emitted"); + } + + #[test] + fn i64_div_u_runs_64_iteration_loop() { + // Pin the long-division loop structure: the core sets the iteration + // counter to 64 (`addi cnt, zero, 64`) and decrements it (`addi cnt, + // cnt, -1`) — both immediates must appear. + let out = run_i64(&[ + WasmOp::I64Const(99), + WasmOp::I64Const(4), + WasmOp::I64DivU, + WasmOp::Drop, + ]); + assert!( + count(&out, |op| matches!( + op, + RiscVOp::Addi { + rs1: Reg::ZERO, + imm: 64, + .. + } + )) >= 1, + "long-division core initialises the loop counter to 64" + ); + assert!( + count(&out, |op| matches!(op, RiscVOp::Addi { imm: -1, .. })) >= 1, + "long-division core decrements the loop counter" + ); + } + // -------- type-state plumbing -------- #[test]