diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs
index 165c3f20ba87..e34269662ce1 100644
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -648,12 +648,56 @@ impl ABIMachineSpec for S390xMachineDeps {
     }
 
     fn gen_inline_probestack(
-        _insts: &mut SmallInstVec<Self::I>,
+        insts: &mut SmallInstVec<Self::I>,
         _call_conv: isa::CallConv,
-        _frame_size: u32,
-        _guard_size: u32,
+        frame_size: u32,
+        guard_size: u32,
     ) {
-        unimplemented!("Inline stack probing is unimplemented on S390x");
+        // Number of probes that we need to perform
+        let probe_count = align_to(frame_size, guard_size) / guard_size;
+
+        // The stack probe loop currently takes 4 instructions and each unrolled
+        // probe takes 2. Set this to 2 to keep the max size to 4 instructions.
+        const PROBE_MAX_UNROLL: u32 = 2;
+
+        if probe_count <= PROBE_MAX_UNROLL {
+            // Unrolled probe loop.
+            for _ in 0..probe_count {
+                insts.extend(Self::gen_sp_reg_adjust(-(guard_size as i32)));
+
+                insts.push(Inst::StoreImm8 {
+                    imm: 0,
+                    mem: MemArg::reg(stack_reg(), MemFlags::trusted()),
+                });
+            }
+        } else {
+            // Explicit probe loop.
+
+            // Load the number of probes into a register used as loop counter.
+            // `gen_inline_probestack` is called after regalloc2, so we can
+            // use the nonallocatable spilltmp register for this purpose.
+            let probe_count_reg = writable_spilltmp_reg();
+            if let Ok(probe_count) = i16::try_from(probe_count) {
+                insts.push(Inst::Mov32SImm16 {
+                    rd: probe_count_reg,
+                    imm: probe_count,
+                });
+            } else {
+                insts.push(Inst::Mov32Imm {
+                    rd: probe_count_reg,
+                    imm: probe_count,
+                });
+            }
+
+            // Emit probe loop. The guard size is assumed to fit in 16 bits.
+            insts.push(Inst::StackProbeLoop {
+                probe_count: probe_count_reg,
+                guard_size: i16::try_from(guard_size).unwrap(),
+            });
+        }
+
+        // Restore the stack pointer to its original position.
+        insts.extend(Self::gen_sp_reg_adjust((probe_count * guard_size) as i32));
     }
 
     fn gen_clobber_save(
diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle
index 451dda7a5863..342f5846fddb 100644
--- a/cranelift/codegen/src/isa/s390x/inst.isle
+++ b/cranelift/codegen/src/isa/s390x/inst.isle
@@ -977,6 +977,11 @@
     (ridx Reg)
     (targets BoxVecMachLabel))
 
+  ;; Stack probe loop sequence, as one compound instruction.
+  (StackProbeLoop
+    (probe_count WritableReg)
+    (guard_size i16))
+
   ;; Load an inline symbol reference with relocation.
   (LoadSymbolReloc
     (rd WritableReg)
diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs
index 3947d1b098f3..eb42cc15a9ab 100644
--- a/cranelift/codegen/src/isa/s390x/inst/emit.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs
@@ -3383,6 +3383,36 @@ impl Inst {
                 start_off = sink.cur_offset();
             }
 
+            Inst::StackProbeLoop {
+                probe_count,
+                guard_size,
+            } => {
+                // Emit the loop start label
+                let loop_start = sink.get_label();
+                sink.bind_label(loop_start, state.ctrl_plane_mut());
+
+                // aghi %r15, -GUARD_SIZE
+                let inst = Inst::AluRSImm16 {
+                    alu_op: ALUOp::Add64,
+                    rd: writable_stack_reg(),
+                    ri: stack_reg(),
+                    imm: -guard_size,
+                };
+                inst.emit(sink, emit_info, state);
+
+                // mvi 0(%r15), 0
+                let inst = Inst::StoreImm8 {
+                    imm: 0,
+                    mem: MemArg::reg(stack_reg(), MemFlags::trusted()),
+                };
+                inst.emit(sink, emit_info, state);
+
+                // brct PROBE_COUNT, LOOP_START
+                let opcode = 0xa76; // BRCT
+                sink.use_label_at_offset(sink.cur_offset(), loop_start, LabelUse::BranchRI);
+                put(sink, &enc_ri_b(opcode, probe_count.to_reg(), 0));
+            }
+
             &Inst::Unwind { ref inst } => {
                 sink.add_unwind(inst.clone());
             }
diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
index 6c03b210b631..7a5841e13707 100644
--- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
@@ -7032,6 +7032,15 @@ fn test_s390x_binemit() {
         "0: cr %r2, %r3 ; jgnh 1f ; cs %r4, %r5, 0(%r6) ; jglh 0b ; 1:",
     ));
 
+    insns.push((
+        Inst::StackProbeLoop {
+            probe_count: writable_gpr(1),
+            guard_size: 4096,
+        },
+        "A7FBF0009200F000A716FFFC",
+        "0: aghi %r15, -4096 ; mvi 0(%r15), 0 ; brct %r1, 0b",
+    ));
+
     insns.push((
         Inst::FpuMove32 {
             rd: writable_vr(8),
diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs
index 9c3322143e0e..65c5d52ec044 100644
--- a/cranelift/codegen/src/isa/s390x/inst/mod.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs
@@ -226,6 +226,7 @@ impl Inst {
             | Inst::Debugtrap
             | Inst::Trap { .. }
             | Inst::JTSequence { .. }
+            | Inst::StackProbeLoop { .. }
            | Inst::LoadSymbolReloc { .. }
            | Inst::LoadAddr { .. }
            | Inst::Loop { .. }
@@ -969,6 +970,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor<'_, impl OperandVisitor>) {
        Inst::JTSequence { ridx, .. } => {
            collector.reg_use(ridx);
        }
+        Inst::StackProbeLoop { probe_count, .. } => {
+            collector.reg_early_def(probe_count);
+        }
        Inst::Loop { body, .. } => {
            // `reuse_def` constraints can't be permitted in a Loop instruction because the operand
            // index will always be relative to the Loop instruction, not the individual
@@ -3286,6 +3290,14 @@ impl Inst {
                format!("{mem_str}{op} {rd}, {mem}")
            }
 
+            &Inst::StackProbeLoop {
+                probe_count,
+                guard_size,
+            } => {
+                let probe_count = pretty_print_reg(probe_count.to_reg());
+                let stack_reg = pretty_print_reg(stack_reg());
+                format!("0: aghi {stack_reg}, -{guard_size} ; mvi 0({stack_reg}), 0 ; brct {probe_count}, 0b")
+            }
            &Inst::Loop { ref body, cond } => {
                let body = body
                    .into_iter()
diff --git a/cranelift/filetests/filetests/isa/s390x/inline-probestack.clif b/cranelift/filetests/filetests/isa/s390x/inline-probestack.clif
new file mode 100644
index 000000000000..4e2faaa5cabd
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/s390x/inline-probestack.clif
@@ -0,0 +1,96 @@
+test compile precise-output
+set enable_probestack=true
+set probestack_strategy=inline
+; This is the default and is equivalent to a page size of 4096
+set probestack_size_log2=12
+target s390x
+
+
+; If the stack size is just one page, we can avoid the stack probe entirely
+function %single_page() -> i64 tail {
+ss0 = explicit_slot 2048
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; VCode:
+;   aghi %r15, -2048
+; block0:
+;   la %r2, 0(%r15)
+;   aghi %r15, 2048
+;   br %r14
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   aghi %r15, -0x800
+; block1: ; offset 0x4
+;   la %r2, 0(%r15)
+;   aghi %r15, 0x800
+;   br %r14
+
+function %unrolled() -> i64 tail {
+ss0 = explicit_slot 8192
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; VCode:
+;   aghi %r15, -4096
+;   mvi 0(%r15), 0
+;   aghi %r15, -4096
+;   mvi 0(%r15), 0
+;   aghi %r15, 8192
+;   aghi %r15, -8192
+; block0:
+;   la %r2, 0(%r15)
+;   aghi %r15, 8192
+;   br %r14
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   aghi %r15, -0x1000
+;   mvi 0(%r15), 0
+;   aghi %r15, -0x1000
+;   mvi 0(%r15), 0
+;   aghi %r15, 0x2000
+;   aghi %r15, -0x2000
+; block1: ; offset 0x18
+;   la %r2, 0(%r15)
+;   aghi %r15, 0x2000
+;   br %r14
+
+function %large() -> i64 tail {
+ss0 = explicit_slot 100000
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; VCode:
+;   lhi %r1, 25
+;   0: aghi %r15, -4096 ; mvi 0(%r15), 0 ; brct %r1, 0b
+;   agfi %r15, 102400
+;   agfi %r15, -100000
+; block0:
+;   la %r2, 0(%r15)
+;   agfi %r15, 100000
+;   br %r14
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   lhi %r1, 0x19
+;   aghi %r15, -0x1000
+;   mvi 0(%r15), 0
+;   brct %r1, 4
+;   agfi %r15, 0x19000
+;   agfi %r15, -0x186a0
+; block1: ; offset 0x1c
+;   la %r2, 0(%r15)
+;   agfi %r15, 0x186a0
+;   br %r14
+
diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs
index 88138bec2916..f7f7951bb6bb 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -8,7 +8,6 @@ use core::str::FromStr;
 use serde_derive::{Deserialize, Serialize};
 #[cfg(any(feature = "cache", feature = "cranelift", feature = "winch"))]
 use std::path::Path;
-use target_lexicon::Architecture;
 use wasmparser::WasmFeatures;
 #[cfg(feature = "cache")]
 use wasmtime_cache::CacheConfig;
@@ -2068,16 +2067,14 @@ impl Config {
 
         let target = self.compiler_target();
 
-        // On supported targets, we enable stack probing by default.
+        // We enable stack probing by default on all targets.
         // This is required on Windows because of the way Windows
         // commits its stacks, but it's also a good idea on other
         // platforms to ensure guard pages are hit for large frame
         // sizes.
-        if probestack_supported(target.architecture) {
-            self.compiler_config
-                .flags
-                .insert("enable_probestack".into());
-        }
+        self.compiler_config
+            .flags
+            .insert("enable_probestack".into());
 
         if let Some(unwind_requested) = self.native_unwind_info {
             if !self
@@ -3021,13 +3018,6 @@ impl PoolingAllocationConfig {
     }
 }
 
-pub(crate) fn probestack_supported(arch: Architecture) -> bool {
-    matches!(
-        arch,
-        Architecture::X86_64 | Architecture::Aarch64(_) | Architecture::Riscv64(_)
-    )
-}
-
 #[cfg(feature = "std")]
 fn detect_host_feature(feature: &str) -> Option<bool> {
     #[cfg(target_arch = "aarch64")]
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index e62b991b337e..c2458bdce4a9 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -306,7 +306,7 @@ impl Engine {
             // runtime.
             "libcall_call_conv" => *value == FlagValue::Enum("isa_default".into()),
             "preserve_frame_pointers" => *value == FlagValue::Bool(true),
-            "enable_probestack" => *value == FlagValue::Bool(crate::config::probestack_supported(target.architecture)),
+            "enable_probestack" => *value == FlagValue::Bool(true),
             "probestack_strategy" => *value == FlagValue::Enum("inline".into()),
 
             // Features wasmtime doesn't use should all be disabled, since