[llvm] Add a threshold to RegStackify to avoid register spills at runtime (PR #97283)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 1 04:23:46 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-webassembly

Author: Yolanda-Chen (yolanda15)

<details>
<summary>Changes</summary>

The RegStackify phase in WebAssembly codegen moves instructions and changes the previous instruction scheduling for register pressure reduction.

In common cases, this does not generate a deep stack, so it is not a big issue for register allocation. With more ternary operations introduced in Relaxed SIMD (e.g. fused multiply-add, integer dot product), and if unrolling is enabled, it becomes more likely that a deep stack is generated, causing register spilling in code generation at runtime.
The [dwconv kernel](https://github.com/google/XNNPACK/blob/master/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3.c#L84) in XNNPACK is an example, where 9 fmadd intrinsics accumulate into a single register. All inputs of these fmadds are pushed onto the stack, which requires at least 19 SIMD registers.

This pull request adds a threshold on the register stack depth when not optimizing for size, and stops stackifying when the depth would exceed the threshold.

---

Patch is 58.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97283.diff


5 Files Affected:

- (modified) llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp (+44-2) 
- (added) llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll (+79) 
- (modified) llvm/test/CodeGen/WebAssembly/simd-bitmask.ll (+69-67) 
- (modified) llvm/test/CodeGen/WebAssembly/simd-build-vector.ll (+73-73) 
- (modified) llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll (+342-342) 


``````````diff
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index e38905c20b839..bc6541680e8ce 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -40,6 +40,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-reg-stackify"
 
+static cl::opt<int> MaxRegStackifyDepth(
+    "webassembly-max-reg-stackify-depth", cl::init(10), cl::Hidden,
+    cl::desc("The maximum number of register uses in stack"));
+
 namespace {
 class WebAssemblyRegStackify final : public MachineFunctionPass {
   StringRef getPassName() const override {
@@ -692,12 +696,16 @@ class TreeWalkerState {
   using mop_reverse_iterator = std::reverse_iterator<mop_iterator>;
   using RangeTy = iterator_range<mop_reverse_iterator>;
   SmallVector<RangeTy, 4> Worklist;
+  int cur_stack_depth;
 
 public:
   explicit TreeWalkerState(MachineInstr *Insert) {
+    cur_stack_depth = 0;
     const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
-    if (!Range.empty())
+    if (!Range.empty()) {
       Worklist.push_back(reverse(Range));
+      cur_stack_depth = getNumRegs(Range);
+    }
   }
 
   bool done() const { return Worklist.empty(); }
@@ -706,18 +714,46 @@ class TreeWalkerState {
     RangeTy &Range = Worklist.back();
     MachineOperand &Op = *Range.begin();
     Range = drop_begin(Range);
+    if (Op.isReg())
+      cur_stack_depth--;
+
     if (Range.empty())
       Worklist.pop_back();
+
+    assert(cur_stack_depth >= 0);
     assert((Worklist.empty() || !Worklist.back().empty()) &&
            "Empty ranges shouldn't remain in the worklist");
     return Op;
   }
 
+  template <typename T> int getNumRegs(const T &Range) {
+    int num = 0;
+    for (auto it = Range.begin(); it != Range.end(); it++) {
+      if (it->isReg())
+        num++;
+    }
+    return num;
+  }
+
   /// Push Instr's operands onto the stack to be visited.
   void pushOperands(MachineInstr *Instr) {
     const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
-    if (!Range.empty())
+    if (!Range.empty()) {
       Worklist.push_back(reverse(Range));
+      cur_stack_depth += getNumRegs(Range);
+    }
+  }
+
+  bool canExceedStackDepth(MachineInstr *Instr) {
+    const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+    int expect_stack_depth = cur_stack_depth + getNumRegs(Range);
+    if (expect_stack_depth > MaxRegStackifyDepth) {
+      LLVM_DEBUG(dbgs() << "Stop stackify as the stack depth may reach "
+                        << expect_stack_depth << " and exceeds the threshold!");
+      return true;
+    }
+
+    return false;
   }
 
   /// Some of Instr's operands are on the top of the stack; remove them and
@@ -726,7 +762,9 @@ class TreeWalkerState {
     assert(hasRemainingOperands(Instr) &&
            "Reseting operands should only be done when the instruction has "
            "an operand still on the stack");
+    int remain_reg_num = getNumRegs(Worklist.back());
     Worklist.back() = reverse(Instr->explicit_uses());
+    cur_stack_depth += getNumRegs(Worklist.back()) - remain_reg_num;
   }
 
   /// Test whether Instr has operands remaining to be visited at the top of
@@ -866,6 +904,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
         if (WebAssembly::isArgument(DefI->getOpcode()))
           continue;
 
+        if (!MF.getFunction().hasOptSize() &&
+            TreeWalker.canExceedStackDepth(DefI))
+          continue;
+
         MachineOperand *Def =
             DefI->findRegisterDefOperand(Reg, /*TRI=*/nullptr);
         assert(Def != nullptr);
diff --git a/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
new file mode 100644
index 0000000000000..04995b90a4470
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128,+relaxed-simd | FileCheck %s
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
+target triple = "wasm32-unknown-wasi"
+
+define void @MADD_F32x4_a1(ptr noundef %i0, ptr noundef %i1, ptr noundef %i2, ptr noundef %i3, ptr noundef %i4, ptr noundef %i5, ptr nocapture noundef readonly %w, ptr noundef %output) local_unnamed_addr #0 {
+; CHECK-LABEL: MADD_F32x4_a1:
+; CHECK:         .functype MADD_F32x4_a1 (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 64:p2align=0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 32:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    local.set 8
+; CHECK-NEXT:    local.get 7
+; CHECK-NEXT:    local.get 5
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 192:p2align=0
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 160:p2align=0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 128:p2align=0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 6
+; CHECK-NEXT:    v128.load 96:p2align=0
+; CHECK-NEXT:    local.get 8
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    f32x4.relaxed_madd
+; CHECK-NEXT:    v128.store 0:p2align=0
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  %33 = load <4 x float>, ptr %w, align 1
+  %35 = load <4 x float>, ptr %i0, align 1
+  %add.ptr109 = getelementptr inbounds float, ptr %w, i32 8
+  %37 = load <4 x float>, ptr %add.ptr109, align 1
+  %39 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %35, <4 x float> %37, <4 x float> %33)
+  %41 = load <4 x float>, ptr %i1, align 1
+  %add.ptr119 = getelementptr inbounds float, ptr %w, i32 16
+  %43 = load <4 x float>, ptr %add.ptr119, align 1
+  %45 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %41, <4 x float> %43, <4 x float> %39)
+  %47 = load <4 x float>, ptr %i2, align 1
+  %add.ptr129 = getelementptr inbounds float, ptr %w, i32 24
+  %49 = load <4 x float>, ptr %add.ptr129, align 1
+  %51 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %47, <4 x float> %49, <4 x float> %45)
+  %53 = load <4 x float>, ptr %i3, align 1
+  %add.ptr139 = getelementptr inbounds float, ptr %w, i32 32
+  %55 = load <4 x float>, ptr %add.ptr139, align 1
+  %57 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %53, <4 x float> %55, <4 x float> %51)
+  %59 = load <4 x float>, ptr %i4, align 1
+  %add.ptr149 = getelementptr inbounds float, ptr %w, i32 40
+  %61 = load <4 x float>, ptr %add.ptr149, align 1
+  %63 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %59, <4 x float> %61, <4 x float> %57)
+  %65 = load <4 x float>, ptr %i5, align 1
+  %add.ptr159 = getelementptr inbounds float, ptr %w, i32 48
+  %67 = load <4 x float>, ptr %add.ptr159, align 1
+  %69 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %65, <4 x float> %67, <4 x float> %63)
+  store <4 x float> %69, ptr %output, align 1
+  ret void
+}
+
+attributes #0 = { "target-features"="+simd128,+relaxed-simd" }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
index ca160c091b229..dfd7b784e07a4 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
@@ -175,16 +175,75 @@ define i8 @bitmask_v8i8(<8 x i8> %v) {
 define i32 @bitmask_v32i8(<32 x i8> %v) {
 ; CHECK-LABEL: bitmask_v32i8:
 ; CHECK:         .functype bitmask_v32i8 (v128, v128) -> (i32)
-; CHECK-NEXT:    .local v128
+; CHECK-NEXT:    .local v128, i32
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    global.get __stack_pointer
 ; CHECK-NEXT:    i32.const 16
 ; CHECK-NEXT:    i32.sub
 ; CHECK-NEXT:    drop
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK-NEXT:    local.tee 2
 ; CHECK-NEXT:    i8x16.eq
+; CHECK-NEXT:    local.tee 1
+; CHECK-NEXT:    i8x16.extract_lane_u 7
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 23
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 6
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 22
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 5
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 21
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 4
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 20
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 3
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 19
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 2
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 18
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 1
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 17
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.extract_lane_u 0
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    local.set 3
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i8x16.eq
 ; CHECK-NEXT:    local.tee 0
 ; CHECK-NEXT:    i8x16.extract_lane_u 0
 ; CHECK-NEXT:    i32.const 1
@@ -295,109 +354,52 @@ define i32 @bitmask_v32i8(<32 x i8> %v) {
 ; CHECK-NEXT:    i32.const 65535
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i8x16.eq
-; CHECK-NEXT:    local.tee 0
 ; CHECK-NEXT:    i8x16.extract_lane_u 15
 ; CHECK-NEXT:    i32.const 31
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 14
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 30
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 13
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 29
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 12
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 28
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 11
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 27
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 10
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 26
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 9
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 25
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i8x16.extract_lane_u 8
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.const 24
 ; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 7
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 23
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 6
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 22
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 5
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 21
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 4
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 20
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 3
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 19
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 2
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 18
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 1
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 17
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i8x16.extract_lane_u 0
-; CHECK-NEXT:    i32.const 1
-; CHECK-NEXT:    i32.and
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.shl
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
-; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    local.get 3
 ; CHECK-NEXT:    i32.or
 ; CHECK-NEXT:    i32.or
 ; CHECK-NEXT:    i32.or
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index a51b358de2e89..1603e5bd90434 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
 
 ; Test that the logic to choose between v128.const vector
@@ -118,92 +118,92 @@ define <16 x i8> @swizzle_all_i8x16(<16 x i8> %src, <16 x i8> %mask) {
 ; CHECK-LABEL: swizzle_all_i8x16:
 ; CHECK:         .functype swizzle_all_i8x16 (v128, v128) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push65=, __stack_pointer
-; CHECK-NEXT:    i32.const $push66=, 16
-; CHECK-NEXT:    i32.sub $push83=, $pop65, $pop66
-; CHECK-NEXT:    local.tee $push82=, $2=, $pop83
-; CHECK-NEXT:    v128.store 0($pop82), $0
-; CHECK-NEXT:    i8x16.extract_lane_u $push61=, $1, 15
+; CHECK-NEXT:    global.get $push63=, __stack_pointer
+; CHECK-NEXT:    i32.const $push64=, 16
+; CHECK-NEXT:    i32.sub $push81=, $pop63, $pop64
+; CHECK-NEXT:    local.tee $push80=, $3=, $pop81
+; CHECK-NEXT:    v128.store 0($pop80), $0
+; CHECK-NEXT:    i8x16.extract_lane_u $push25=, $1, 6
 ; CHECK-NEXT:    i32.const $push1=, 15
-; CHECK-NEXT:    i32.and $push62=, $pop61, $pop1
-; CHECK-NEXT:    i32.or $push63=, $2, $pop62
-; CHECK-NEXT:    i8x16.extract_lane_u $push57=, $1, 14
-; CHECK-NEXT:    i32.const $push81=, 15
-; CHECK-NEXT:    i32.and $push58=, $pop57, $pop81
-; CHECK-NEXT:    i32.or $push59=, $2, $pop58
-; CHECK-NEXT:    i8x16.extract_lane_u $push53=, $1, 13
-; CHECK-NEXT:    i32.const $push80=, 15
-; CHECK-NEXT:    i32.and $push54=, $pop53, $pop80
-; CHECK-NEXT:    i32.or $push55=, $2, $pop54
-; CHECK-NEXT:    i8x16.extract_lane_u $push49=, $1, 12
+; CHECK-NEXT:    i32.and $push26=, $pop25, $pop1
+; CHECK-NEXT:    i32.or $push27=, $3, $pop26
+; CHECK-NEXT:    i8x16.extract_lane_u $push21=, $1, 5
 ; CHECK-NEXT:    i32.const $push79=, 15
-; CHECK-NEXT:    i32.and $push50=, $pop49, $pop79
-; CHECK-NEXT:    i32.or $push51=, $2, $pop50
-; CHECK-NEXT:    i8x16.extract_lane_u $push45=, $1, 11
+; CHECK-NEXT:    i32.and $push22=, $pop21, $pop79
+; CHECK-NEXT:    i32.or $push23=, $3, $pop22
+; CHECK-NEXT:    i8x16.extract_lane_u $push17=, $1, 4
 ; CHECK-NEXT:    i32.const $push78=, 15
-; CHECK-NEXT:    i32.and $push46=, $pop45, $pop78
-; CHECK-NEXT:    i32.or $push47=, $2, $pop46
-; CHECK-NEXT:    i8x16.extract_lane_u $push41=, $1, 10
+; CHECK-NEXT:    i32.and $push18=, $pop17, $pop78
+; CHECK-NEXT:    i32.or $push19=, $3, $pop18
+; CHECK-NEXT:    i8x16.extract_lane_u $push13=, $1, 3
 ; CHECK-NEXT:    i32.const $push77=, 15
-; CHECK-NEXT:    i32.and $push42=, $pop41, $pop77
-; CHECK-NEXT:    i32.or $push43=, $2, $pop42
-; CHECK-NEXT:    i8x16.extract_lane_u $push37=, $1, 9
+; CHECK-NEXT:    i32.and $push14=, $pop13, $pop77
+; CHECK-NEXT:    i32.or $push15=, $3, $pop14
+; CHECK-NEXT:    i8x16.extract_lane_u $push9=, $1, 2
 ; CHECK-NEXT:    i32.const $push76=, 15
-; CHECK-NEXT:    i32.and $push38=, $pop37, $pop76
-; CHECK-NEXT:    i32.or $push39=, $2, $pop38
-; CHECK-NEXT:    i8x16.extract_lane_u $push33=, $1, 8
+; CHECK-NEXT:    i32.and $push10=, $pop9, $pop76
+; CHECK-NEXT:    i32.or $push11=, $3, $pop10
+; CHECK-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
 ; CHECK-NEXT:    i32.const $push75=, 15
-; CHECK-NEXT:    i32.and $push34=, $pop33, $pop75
-; CHECK-NEXT:    i32.or $push35=, $2, $pop34
-; CHECK-NEXT:    i8x16.extract_lane_u $push29=, $1, 7
+; CHECK-NEXT:    i32.and $push2=, $pop0, $pop75
+; CHECK-NEXT:    i32.or $push3=, $3, $pop2
+; CHECK-NEXT:    i8x16.extract_lane_u $push4=, $1, 0
 ; CHECK-NEXT:    i32.const $push74=, 15
-; CHECK-NEXT:    i32.and $push30=, $pop29, $pop74
-; CHECK-NEXT:    i32.or $push31=, $2, $pop30
-; CHECK-NEXT:    i8x16.extract_lane_u $push25=, $1, 6
+; CHECK-NEXT:    i32.and $push5=, $pop4, $pop74
+; CHECK-NEXT:    i32.or $push6=, $3, $pop5
+; CHECK-NEXT:    v128.load8_splat $push7=, 0($pop6)
+; CHECK-NEXT:    v128.load8_lane $push8=, 0($pop3), $pop7, 1
+; CHECK-NEXT:    v128.load8_lane $push12=, 0($pop11), $pop8, 2
+; CHECK-NEXT:    v128.load8_lane $push16=, 0($pop15), $pop12, 3
+; CHECK-NEXT:    v128.load8_lane $push20=, 0($pop19), $pop16, 4
+; CHECK-NEXT:    v128.load8_lane $push24=, 0($pop23), $pop20, 5
+; CHECK-NEXT:    v128.load8_lane $0=, 0($pop27), $pop24, 6
+; CHECK-NEXT:    i8x16.extract_lane_u $push28=, $1, 7
 ; CHECK-NEXT:    i32.const $push73=, 15
-; CHECK-NEXT:    i32.and $push26=, $pop25, $pop73
-; CHECK-NEXT:    i32.or $push27=, $2, $pop26
-; CHECK-NEXT:    i8x16.extract_lane_u $push21=, $1, 5
+; CHECK-NEXT:    i32.and $2=, $pop28, $pop73
+; CHECK-NEXT:    i8x16.extract_lane_u $push59=, $1, 15
 ; CHECK-NEXT:    i32.const $push72=, 15
-; CHECK-NEXT:    i32.and $push22=, $pop21, $pop72
-; CHECK-NEXT:    i32.or $push23=, $2, $pop22
-; CHECK-NEXT:    i8x16.extract_lane_u $push17=, $1, 4
+; CHECK-NEXT:    i32.and $push60=, $pop59, $pop72
+; CHECK-NEXT:    i32.or $push61=, $3, $pop60
+; CHECK-NEXT:    i8x16.extract_lane_u $push55=, $1, 14
 ; CHECK-NEXT:    i32.const $push71=, 15
-; CHECK-NEXT:    i32.and $push18=, $pop17, $pop71
-; CHECK-NEXT:    i32.or $push19=, $2, $pop18
-; CHECK-NEXT:    i8x16.extract_lane_u $push13=, $1, 3
+; CHECK-NEXT:    i32.and $push56=, $pop55, $pop71
+; CHECK-NEXT:    i32.or $push57=, $3, $pop56
+; CHECK-NEXT:    i8x16.extract_lane_u $push51=, $1, 13
 ; CHECK-NEXT:    i32.const $push70=, 15
-; CHECK-NEXT:    i32.and $push14=, $pop13, $pop70
-; CHECK-NEXT:    i32.or $push15=, $2, $pop14
-; CHECK-NEXT:    i8x16.extract_lane_u $push9=, $1, 2
+; CHECK-NEXT:    i32.and $push52=, $pop51, $pop70
+; CHECK-NEXT:    i32.or $push53=, $3, $pop52
+; CHECK-NEXT:    i8x16.extract_lane_u $push47=, $1, 12
 ; CHECK-NEXT:    i32.const $push69=, 15
-; CHECK-NEXT:    i32.and $push10=, $pop9, $pop69
-; CHECK-NEXT:    i32.or $push11=, $2, $pop10
-; CHECK-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
+; CHECK-NEXT:    i32.and $push48=, $pop47, $pop69
+; CHECK-NEXT:    i32.or $push49=, $3, $pop48
+; CHECK-NEXT:    i8x16.extract_lane_u $push43=, $1, 11
 ; CHECK-NEXT:   ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/97283


More information about the llvm-commits mailing list