[llvm] Add a threshold to RegStackify to avoid register spills at runtime (PR #97283)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 1 04:23:11 PDT 2024
https://github.com/yolanda15 created https://github.com/llvm/llvm-project/pull/97283
The RegStackify pass in WebAssembly codegen moves instructions, and in doing so changes the order chosen by the earlier instruction scheduling, which was done to reduce register pressure.
In common cases, this does not generate a deep stack, so it causes no significant problem for register allocation. However, with more ternary operations introduced in Relaxed SIMD (e.g. fused multiply-add, integer dot product), and when unrolling is enabled, it becomes more likely that a deep stack is generated, which causes register spilling when code is generated at runtime.
The [dwconv kernel](https://github.com/google/XNNPACK/blob/master/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3.c#L84) in XNNPACK is an example, where 9 fmadd intrinsics accumulate into a single register. All inputs of these fmadds are pushed onto the stack, which requires at least 19 SIMD registers.
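This pull request adds a threshold on the register stack depth (`-webassembly-max-reg-stackify-depth`, default 10). When the function is not optimized for size, stackification of an operand is skipped if it would make the stack depth exceed the threshold.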
>From 52e0eb751c263e5ea5d14b54e7c4c32a9d187315 Mon Sep 17 00:00:00 2001
From: Yolanda Chen <yolanda.chen at intel.com>
Date: Sat, 29 Jun 2024 12:48:48 +0800
Subject: [PATCH 1/2] Add threshold to RegStackify to avoid register spills at
runtime
---
.../WebAssembly/WebAssemblyRegStackify.cpp | 52 +-
.../CodeGen/WebAssembly/reg-stackify-simd.ll | 79 ++
llvm/test/CodeGen/WebAssembly/simd-bitmask.ll | 136 ++--
.../CodeGen/WebAssembly/simd-build-vector.ll | 146 ++--
.../WebAssembly/simd-vecreduce-bool.ll | 684 +++++++++---------
5 files changed, 612 insertions(+), 485 deletions(-)
create mode 100644 llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index e38905c20b839..4eea484c782e9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -40,6 +40,10 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-reg-stackify"
+static cl::opt<int> MaxRegStackifyDepth(
+ "webassembly-max-reg-stackify-depth", cl::init(10), cl::Hidden,
+ cl::desc("The maximum number of register uses in stack"));
+
namespace {
class WebAssemblyRegStackify final : public MachineFunctionPass {
StringRef getPassName() const override {
@@ -692,12 +696,16 @@ class TreeWalkerState {
using mop_reverse_iterator = std::reverse_iterator<mop_iterator>;
using RangeTy = iterator_range<mop_reverse_iterator>;
SmallVector<RangeTy, 4> Worklist;
+ int cur_stack_depth;
public:
explicit TreeWalkerState(MachineInstr *Insert) {
+ cur_stack_depth = 0;
const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
- if (!Range.empty())
+ if (!Range.empty()) {
Worklist.push_back(reverse(Range));
+ cur_stack_depth = getNumRegs(Range);
+ }
}
bool done() const { return Worklist.empty(); }
@@ -706,18 +714,47 @@ class TreeWalkerState {
RangeTy &Range = Worklist.back();
MachineOperand &Op = *Range.begin();
Range = drop_begin(Range);
- if (Range.empty())
+ if (Op.isReg()) {
+ cur_stack_depth--;
+ }
+ if (Range.empty()) {
Worklist.pop_back();
+ }
+ assert(cur_stack_depth >= 0);
assert((Worklist.empty() || !Worklist.back().empty()) &&
"Empty ranges shouldn't remain in the worklist");
return Op;
}
+ template <typename T> int getNumRegs(const T &Range) {
+ int num = 0;
+ for (auto it = Range.begin(); it != Range.end(); it++) {
+ if (it->isReg()) {
+ num++;
+ }
+ }
+ return num;
+ }
+
/// Push Instr's operands onto the stack to be visited.
void pushOperands(MachineInstr *Instr) {
const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
- if (!Range.empty())
+ if (!Range.empty()) {
Worklist.push_back(reverse(Range));
+ cur_stack_depth += getNumRegs(Range);
+ }
+ }
+
+ bool canExceedStackDepth(MachineInstr *Instr) {
+ const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+ int expect_stack_depth = cur_stack_depth + getNumRegs(Range);
+ if (expect_stack_depth > MaxRegStackifyDepth) {
+ LLVM_DEBUG(dbgs() << "Stop stackify as the stack depth may reach "
+ << expect_stack_depth << " and exceeds the threshold!");
+ return true;
+ }
+
+ return false;
}
/// Some of Instr's operands are on the top of the stack; remove them and
@@ -726,7 +763,9 @@ class TreeWalkerState {
assert(hasRemainingOperands(Instr) &&
"Reseting operands should only be done when the instruction has "
"an operand still on the stack");
+ int remain_reg_num = getNumRegs(Worklist.back());
Worklist.back() = reverse(Instr->explicit_uses());
+ cur_stack_depth += getNumRegs(Worklist.back()) - remain_reg_num;
}
/// Test whether Instr has operands remaining to be visited at the top of
@@ -786,6 +825,7 @@ class CommutingState {
Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
// Tentatively commute the operands and try again.
+ LLVM_DEBUG(dbgs() << "Commute insert\n");
TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
TreeWalker.resetTopOperands(Insert);
TentativelyCommuting = true;
@@ -837,6 +877,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// operands off the stack in LIFO order.
CommutingState Commuting;
TreeWalkerState TreeWalker(Insert);
+ LLVM_DEBUG(dbgs() << "Walk instruction"; Insert->dump());
while (!TreeWalker.done()) {
MachineOperand &Use = TreeWalker.pop();
@@ -844,6 +885,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (!Use.isReg())
continue;
+ // here only pop reg in stack.
Register Reg = Use.getReg();
assert(Use.isUse() && "explicit_uses() should only iterate over uses");
assert(!Use.isImplicit() &&
@@ -866,6 +908,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (WebAssembly::isArgument(DefI->getOpcode()))
continue;
+ if (!MF.getFunction().hasOptSize() &&
+ TreeWalker.canExceedStackDepth(DefI))
+ continue;
+
MachineOperand *Def =
DefI->findRegisterDefOperand(Reg, /*TRI=*/nullptr);
assert(Def != nullptr);
diff --git a/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
new file mode 100644
index 0000000000000..481a14fde03c9
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128,+relaxed-simd -target-abi=experimental-mv | FileCheck %s
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
+target triple = "wasm32-unknown-wasi"
+
+define void @MADD_F32x4_a1(ptr noundef %i0, ptr noundef %i1, ptr noundef %i2, ptr noundef %i3, ptr noundef %i4, ptr noundef %i5, ptr nocapture noundef readonly %w, ptr noundef %output) local_unnamed_addr #0 {
+; CHECK-LABEL: MADD_F32x4_a1:
+; CHECK: .functype MADD_F32x4_a1 (i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 64:p2align=0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 32:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: local.set 8
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: local.get 5
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 192:p2align=0
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 160:p2align=0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 128:p2align=0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: v128.load 0:p2align=0
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: v128.load 96:p2align=0
+; CHECK-NEXT: local.get 8
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: f32x4.relaxed_madd
+; CHECK-NEXT: v128.store 0:p2align=0
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %33 = load <4 x float>, ptr %w, align 1
+ %35 = load <4 x float>, ptr %i0, align 1
+ %add.ptr109 = getelementptr inbounds float, ptr %w, i32 8
+ %37 = load <4 x float>, ptr %add.ptr109, align 1
+ %39 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %35, <4 x float> %37, <4 x float> %33)
+ %41 = load <4 x float>, ptr %i1, align 1
+ %add.ptr119 = getelementptr inbounds float, ptr %w, i32 16
+ %43 = load <4 x float>, ptr %add.ptr119, align 1
+ %45 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %41, <4 x float> %43, <4 x float> %39)
+ %47 = load <4 x float>, ptr %i2, align 1
+ %add.ptr129 = getelementptr inbounds float, ptr %w, i32 24
+ %49 = load <4 x float>, ptr %add.ptr129, align 1
+ %51 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %47, <4 x float> %49, <4 x float> %45)
+ %53 = load <4 x float>, ptr %i3, align 1
+ %add.ptr139 = getelementptr inbounds float, ptr %w, i32 32
+ %55 = load <4 x float>, ptr %add.ptr139, align 1
+ %57 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %53, <4 x float> %55, <4 x float> %51)
+ %59 = load <4 x float>, ptr %i4, align 1
+ %add.ptr149 = getelementptr inbounds float, ptr %w, i32 40
+ %61 = load <4 x float>, ptr %add.ptr149, align 1
+ %63 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %59, <4 x float> %61, <4 x float> %57)
+ %65 = load <4 x float>, ptr %i5, align 1
+ %add.ptr159 = getelementptr inbounds float, ptr %w, i32 48
+ %67 = load <4 x float>, ptr %add.ptr159, align 1
+ %69 = tail call <4 x float> @llvm.wasm.relaxed.madd.v4f32(<4 x float> %65, <4 x float> %67, <4 x float> %63)
+ store <4 x float> %69, ptr %output, align 1
+ ret void
+}
+
+attributes #0 = { "target-features"="+simd128,+relaxed-simd" }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
index ca160c091b229..dfd7b784e07a4 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
@@ -175,16 +175,75 @@ define i8 @bitmask_v8i8(<8 x i8> %v) {
define i32 @bitmask_v32i8(<32 x i8> %v) {
; CHECK-LABEL: bitmask_v32i8:
; CHECK: .functype bitmask_v32i8 (v128, v128) -> (i32)
-; CHECK-NEXT: .local v128
+; CHECK-NEXT: .local v128, i32
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: global.get __stack_pointer
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.sub
; CHECK-NEXT: drop
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.tee 2
; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: i8x16.extract_lane_u 7
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 23
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 6
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 22
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 5
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 21
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 4
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 20
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 3
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 19
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 2
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 18
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 1
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 17
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.extract_lane_u 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.set 3
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i8x16.eq
; CHECK-NEXT: local.tee 0
; CHECK-NEXT: i8x16.extract_lane_u 0
; CHECK-NEXT: i32.const 1
@@ -295,109 +354,52 @@ define i32 @bitmask_v32i8(<32 x i8> %v) {
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32.and
; CHECK-NEXT: local.get 1
-; CHECK-NEXT: local.get 2
-; CHECK-NEXT: i8x16.eq
-; CHECK-NEXT: local.tee 0
; CHECK-NEXT: i8x16.extract_lane_u 15
; CHECK-NEXT: i32.const 31
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 14
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 30
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 13
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 29
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 12
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 28
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 11
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 27
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 10
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 26
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 9
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 25
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.extract_lane_u 8
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 7
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 23
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 6
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 22
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 5
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 21
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 4
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 20
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 3
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 19
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 2
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 18
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 1
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 17
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.extract_lane_u 0
-; CHECK-NEXT: i32.const 1
-; CHECK-NEXT: i32.and
-; CHECK-NEXT: i32.const 16
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
-; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 3
; CHECK-NEXT: i32.or
; CHECK-NEXT: i32.or
; CHECK-NEXT: i32.or
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index a51b358de2e89..1603e5bd90434 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
; Test that the logic to choose between v128.const vector
@@ -118,92 +118,92 @@ define <16 x i8> @swizzle_all_i8x16(<16 x i8> %src, <16 x i8> %mask) {
; CHECK-LABEL: swizzle_all_i8x16:
; CHECK: .functype swizzle_all_i8x16 (v128, v128) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: global.get $push65=, __stack_pointer
-; CHECK-NEXT: i32.const $push66=, 16
-; CHECK-NEXT: i32.sub $push83=, $pop65, $pop66
-; CHECK-NEXT: local.tee $push82=, $2=, $pop83
-; CHECK-NEXT: v128.store 0($pop82), $0
-; CHECK-NEXT: i8x16.extract_lane_u $push61=, $1, 15
+; CHECK-NEXT: global.get $push63=, __stack_pointer
+; CHECK-NEXT: i32.const $push64=, 16
+; CHECK-NEXT: i32.sub $push81=, $pop63, $pop64
+; CHECK-NEXT: local.tee $push80=, $3=, $pop81
+; CHECK-NEXT: v128.store 0($pop80), $0
+; CHECK-NEXT: i8x16.extract_lane_u $push25=, $1, 6
; CHECK-NEXT: i32.const $push1=, 15
-; CHECK-NEXT: i32.and $push62=, $pop61, $pop1
-; CHECK-NEXT: i32.or $push63=, $2, $pop62
-; CHECK-NEXT: i8x16.extract_lane_u $push57=, $1, 14
-; CHECK-NEXT: i32.const $push81=, 15
-; CHECK-NEXT: i32.and $push58=, $pop57, $pop81
-; CHECK-NEXT: i32.or $push59=, $2, $pop58
-; CHECK-NEXT: i8x16.extract_lane_u $push53=, $1, 13
-; CHECK-NEXT: i32.const $push80=, 15
-; CHECK-NEXT: i32.and $push54=, $pop53, $pop80
-; CHECK-NEXT: i32.or $push55=, $2, $pop54
-; CHECK-NEXT: i8x16.extract_lane_u $push49=, $1, 12
+; CHECK-NEXT: i32.and $push26=, $pop25, $pop1
+; CHECK-NEXT: i32.or $push27=, $3, $pop26
+; CHECK-NEXT: i8x16.extract_lane_u $push21=, $1, 5
; CHECK-NEXT: i32.const $push79=, 15
-; CHECK-NEXT: i32.and $push50=, $pop49, $pop79
-; CHECK-NEXT: i32.or $push51=, $2, $pop50
-; CHECK-NEXT: i8x16.extract_lane_u $push45=, $1, 11
+; CHECK-NEXT: i32.and $push22=, $pop21, $pop79
+; CHECK-NEXT: i32.or $push23=, $3, $pop22
+; CHECK-NEXT: i8x16.extract_lane_u $push17=, $1, 4
; CHECK-NEXT: i32.const $push78=, 15
-; CHECK-NEXT: i32.and $push46=, $pop45, $pop78
-; CHECK-NEXT: i32.or $push47=, $2, $pop46
-; CHECK-NEXT: i8x16.extract_lane_u $push41=, $1, 10
+; CHECK-NEXT: i32.and $push18=, $pop17, $pop78
+; CHECK-NEXT: i32.or $push19=, $3, $pop18
+; CHECK-NEXT: i8x16.extract_lane_u $push13=, $1, 3
; CHECK-NEXT: i32.const $push77=, 15
-; CHECK-NEXT: i32.and $push42=, $pop41, $pop77
-; CHECK-NEXT: i32.or $push43=, $2, $pop42
-; CHECK-NEXT: i8x16.extract_lane_u $push37=, $1, 9
+; CHECK-NEXT: i32.and $push14=, $pop13, $pop77
+; CHECK-NEXT: i32.or $push15=, $3, $pop14
+; CHECK-NEXT: i8x16.extract_lane_u $push9=, $1, 2
; CHECK-NEXT: i32.const $push76=, 15
-; CHECK-NEXT: i32.and $push38=, $pop37, $pop76
-; CHECK-NEXT: i32.or $push39=, $2, $pop38
-; CHECK-NEXT: i8x16.extract_lane_u $push33=, $1, 8
+; CHECK-NEXT: i32.and $push10=, $pop9, $pop76
+; CHECK-NEXT: i32.or $push11=, $3, $pop10
+; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 1
; CHECK-NEXT: i32.const $push75=, 15
-; CHECK-NEXT: i32.and $push34=, $pop33, $pop75
-; CHECK-NEXT: i32.or $push35=, $2, $pop34
-; CHECK-NEXT: i8x16.extract_lane_u $push29=, $1, 7
+; CHECK-NEXT: i32.and $push2=, $pop0, $pop75
+; CHECK-NEXT: i32.or $push3=, $3, $pop2
+; CHECK-NEXT: i8x16.extract_lane_u $push4=, $1, 0
; CHECK-NEXT: i32.const $push74=, 15
-; CHECK-NEXT: i32.and $push30=, $pop29, $pop74
-; CHECK-NEXT: i32.or $push31=, $2, $pop30
-; CHECK-NEXT: i8x16.extract_lane_u $push25=, $1, 6
+; CHECK-NEXT: i32.and $push5=, $pop4, $pop74
+; CHECK-NEXT: i32.or $push6=, $3, $pop5
+; CHECK-NEXT: v128.load8_splat $push7=, 0($pop6)
+; CHECK-NEXT: v128.load8_lane $push8=, 0($pop3), $pop7, 1
+; CHECK-NEXT: v128.load8_lane $push12=, 0($pop11), $pop8, 2
+; CHECK-NEXT: v128.load8_lane $push16=, 0($pop15), $pop12, 3
+; CHECK-NEXT: v128.load8_lane $push20=, 0($pop19), $pop16, 4
+; CHECK-NEXT: v128.load8_lane $push24=, 0($pop23), $pop20, 5
+; CHECK-NEXT: v128.load8_lane $0=, 0($pop27), $pop24, 6
+; CHECK-NEXT: i8x16.extract_lane_u $push28=, $1, 7
; CHECK-NEXT: i32.const $push73=, 15
-; CHECK-NEXT: i32.and $push26=, $pop25, $pop73
-; CHECK-NEXT: i32.or $push27=, $2, $pop26
-; CHECK-NEXT: i8x16.extract_lane_u $push21=, $1, 5
+; CHECK-NEXT: i32.and $2=, $pop28, $pop73
+; CHECK-NEXT: i8x16.extract_lane_u $push59=, $1, 15
; CHECK-NEXT: i32.const $push72=, 15
-; CHECK-NEXT: i32.and $push22=, $pop21, $pop72
-; CHECK-NEXT: i32.or $push23=, $2, $pop22
-; CHECK-NEXT: i8x16.extract_lane_u $push17=, $1, 4
+; CHECK-NEXT: i32.and $push60=, $pop59, $pop72
+; CHECK-NEXT: i32.or $push61=, $3, $pop60
+; CHECK-NEXT: i8x16.extract_lane_u $push55=, $1, 14
; CHECK-NEXT: i32.const $push71=, 15
-; CHECK-NEXT: i32.and $push18=, $pop17, $pop71
-; CHECK-NEXT: i32.or $push19=, $2, $pop18
-; CHECK-NEXT: i8x16.extract_lane_u $push13=, $1, 3
+; CHECK-NEXT: i32.and $push56=, $pop55, $pop71
+; CHECK-NEXT: i32.or $push57=, $3, $pop56
+; CHECK-NEXT: i8x16.extract_lane_u $push51=, $1, 13
; CHECK-NEXT: i32.const $push70=, 15
-; CHECK-NEXT: i32.and $push14=, $pop13, $pop70
-; CHECK-NEXT: i32.or $push15=, $2, $pop14
-; CHECK-NEXT: i8x16.extract_lane_u $push9=, $1, 2
+; CHECK-NEXT: i32.and $push52=, $pop51, $pop70
+; CHECK-NEXT: i32.or $push53=, $3, $pop52
+; CHECK-NEXT: i8x16.extract_lane_u $push47=, $1, 12
; CHECK-NEXT: i32.const $push69=, 15
-; CHECK-NEXT: i32.and $push10=, $pop9, $pop69
-; CHECK-NEXT: i32.or $push11=, $2, $pop10
-; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 1
+; CHECK-NEXT: i32.and $push48=, $pop47, $pop69
+; CHECK-NEXT: i32.or $push49=, $3, $pop48
+; CHECK-NEXT: i8x16.extract_lane_u $push43=, $1, 11
; CHECK-NEXT: i32.const $push68=, 15
-; CHECK-NEXT: i32.and $push2=, $pop0, $pop68
-; CHECK-NEXT: i32.or $push3=, $2, $pop2
-; CHECK-NEXT: i8x16.extract_lane_u $push4=, $1, 0
+; CHECK-NEXT: i32.and $push44=, $pop43, $pop68
+; CHECK-NEXT: i32.or $push45=, $3, $pop44
+; CHECK-NEXT: i8x16.extract_lane_u $push39=, $1, 10
; CHECK-NEXT: i32.const $push67=, 15
-; CHECK-NEXT: i32.and $push5=, $pop4, $pop67
-; CHECK-NEXT: i32.or $push6=, $2, $pop5
-; CHECK-NEXT: v128.load8_splat $push7=, 0($pop6)
-; CHECK-NEXT: v128.load8_lane $push8=, 0($pop3), $pop7, 1
-; CHECK-NEXT: v128.load8_lane $push12=, 0($pop11), $pop8, 2
-; CHECK-NEXT: v128.load8_lane $push16=, 0($pop15), $pop12, 3
-; CHECK-NEXT: v128.load8_lane $push20=, 0($pop19), $pop16, 4
-; CHECK-NEXT: v128.load8_lane $push24=, 0($pop23), $pop20, 5
-; CHECK-NEXT: v128.load8_lane $push28=, 0($pop27), $pop24, 6
-; CHECK-NEXT: v128.load8_lane $push32=, 0($pop31), $pop28, 7
-; CHECK-NEXT: v128.load8_lane $push36=, 0($pop35), $pop32, 8
-; CHECK-NEXT: v128.load8_lane $push40=, 0($pop39), $pop36, 9
-; CHECK-NEXT: v128.load8_lane $push44=, 0($pop43), $pop40, 10
-; CHECK-NEXT: v128.load8_lane $push48=, 0($pop47), $pop44, 11
-; CHECK-NEXT: v128.load8_lane $push52=, 0($pop51), $pop48, 12
-; CHECK-NEXT: v128.load8_lane $push56=, 0($pop55), $pop52, 13
-; CHECK-NEXT: v128.load8_lane $push60=, 0($pop59), $pop56, 14
-; CHECK-NEXT: v128.load8_lane $push64=, 0($pop63), $pop60, 15
-; CHECK-NEXT: return $pop64
+; CHECK-NEXT: i32.and $push40=, $pop39, $pop67
+; CHECK-NEXT: i32.or $push41=, $3, $pop40
+; CHECK-NEXT: i8x16.extract_lane_u $push35=, $1, 9
+; CHECK-NEXT: i32.const $push66=, 15
+; CHECK-NEXT: i32.and $push36=, $pop35, $pop66
+; CHECK-NEXT: i32.or $push37=, $3, $pop36
+; CHECK-NEXT: i8x16.extract_lane_u $push31=, $1, 8
+; CHECK-NEXT: i32.const $push65=, 15
+; CHECK-NEXT: i32.and $push32=, $pop31, $pop65
+; CHECK-NEXT: i32.or $push33=, $3, $pop32
+; CHECK-NEXT: i32.or $push29=, $3, $2
+; CHECK-NEXT: v128.load8_lane $push30=, 0($pop29), $0, 7
+; CHECK-NEXT: v128.load8_lane $push34=, 0($pop33), $pop30, 8
+; CHECK-NEXT: v128.load8_lane $push38=, 0($pop37), $pop34, 9
+; CHECK-NEXT: v128.load8_lane $push42=, 0($pop41), $pop38, 10
+; CHECK-NEXT: v128.load8_lane $push46=, 0($pop45), $pop42, 11
+; CHECK-NEXT: v128.load8_lane $push50=, 0($pop49), $pop46, 12
+; CHECK-NEXT: v128.load8_lane $push54=, 0($pop53), $pop50, 13
+; CHECK-NEXT: v128.load8_lane $push58=, 0($pop57), $pop54, 14
+; CHECK-NEXT: v128.load8_lane $push62=, 0($pop61), $pop58, 15
+; CHECK-NEXT: return $pop62
%m0 = extractelement <16 x i8> %mask, i32 0
%s0 = extractelement <16 x i8> %src, i8 %m0
%v0 = insertelement <16 x i8> undef, i8 %s0, i32 0
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
index e6497bca98dc2..9e95ea3323968 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
@@ -690,389 +690,389 @@ define i1 @test_all_v64i8(<64 x i8> %x) {
; CHECK-LABEL: test_all_v64i8:
; CHECK: .functype test_all_v64i8 (v128, v128, v128, v128) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: global.get $push287=, __stack_pointer
-; CHECK-NEXT: i32.const $push288=, 16
-; CHECK-NEXT: i32.sub $drop=, $pop287, $pop288
-; CHECK-NEXT: i8x16.extract_lane_u $push220=, $2, 0
+; CHECK-NEXT: global.get $push285=, __stack_pointer
+; CHECK-NEXT: i32.const $push286=, 16
+; CHECK-NEXT: i32.sub $drop=, $pop285, $pop286
+; CHECK-NEXT: i8x16.extract_lane_u $push40=, $1, 8
; CHECK-NEXT: i32.const $push1=, 1
-; CHECK-NEXT: i32.and $push221=, $pop220, $pop1
-; CHECK-NEXT: i8x16.extract_lane_u $push222=, $2, 1
-; CHECK-NEXT: i32.const $push380=, 1
-; CHECK-NEXT: i32.and $push223=, $pop222, $pop380
-; CHECK-NEXT: i32.const $push379=, 1
-; CHECK-NEXT: i32.shl $push224=, $pop223, $pop379
-; CHECK-NEXT: i32.or $push225=, $pop221, $pop224
-; CHECK-NEXT: i8x16.extract_lane_u $push226=, $2, 2
+; CHECK-NEXT: i32.and $push41=, $pop40, $pop1
+; CHECK-NEXT: i32.const $push42=, 24
+; CHECK-NEXT: i32.shl $push43=, $pop41, $pop42
+; CHECK-NEXT: i8x16.extract_lane_u $push35=, $1, 7
; CHECK-NEXT: i32.const $push378=, 1
-; CHECK-NEXT: i32.and $push227=, $pop226, $pop378
-; CHECK-NEXT: i32.const $push87=, 2
-; CHECK-NEXT: i32.shl $push228=, $pop227, $pop87
-; CHECK-NEXT: i32.or $push229=, $pop225, $pop228
-; CHECK-NEXT: i8x16.extract_lane_u $push230=, $2, 3
+; CHECK-NEXT: i32.and $push36=, $pop35, $pop378
+; CHECK-NEXT: i32.const $push37=, 23
+; CHECK-NEXT: i32.shl $push38=, $pop36, $pop37
+; CHECK-NEXT: i8x16.extract_lane_u $push30=, $1, 6
; CHECK-NEXT: i32.const $push377=, 1
-; CHECK-NEXT: i32.and $push231=, $pop230, $pop377
-; CHECK-NEXT: i32.const $push92=, 3
-; CHECK-NEXT: i32.shl $push232=, $pop231, $pop92
-; CHECK-NEXT: i32.or $push233=, $pop229, $pop232
-; CHECK-NEXT: i8x16.extract_lane_u $push234=, $2, 4
+; CHECK-NEXT: i32.and $push31=, $pop30, $pop377
+; CHECK-NEXT: i32.const $push32=, 22
+; CHECK-NEXT: i32.shl $push33=, $pop31, $pop32
+; CHECK-NEXT: i8x16.extract_lane_u $push25=, $1, 5
; CHECK-NEXT: i32.const $push376=, 1
-; CHECK-NEXT: i32.and $push235=, $pop234, $pop376
-; CHECK-NEXT: i32.const $push97=, 4
-; CHECK-NEXT: i32.shl $push236=, $pop235, $pop97
-; CHECK-NEXT: i32.or $push237=, $pop233, $pop236
-; CHECK-NEXT: i8x16.extract_lane_u $push238=, $2, 5
+; CHECK-NEXT: i32.and $push26=, $pop25, $pop376
+; CHECK-NEXT: i32.const $push27=, 21
+; CHECK-NEXT: i32.shl $push28=, $pop26, $pop27
+; CHECK-NEXT: i8x16.extract_lane_u $push20=, $1, 4
; CHECK-NEXT: i32.const $push375=, 1
-; CHECK-NEXT: i32.and $push239=, $pop238, $pop375
-; CHECK-NEXT: i32.const $push102=, 5
-; CHECK-NEXT: i32.shl $push240=, $pop239, $pop102
-; CHECK-NEXT: i32.or $push241=, $pop237, $pop240
-; CHECK-NEXT: i8x16.extract_lane_u $push242=, $2, 6
+; CHECK-NEXT: i32.and $push21=, $pop20, $pop375
+; CHECK-NEXT: i32.const $push22=, 20
+; CHECK-NEXT: i32.shl $push23=, $pop21, $pop22
+; CHECK-NEXT: i8x16.extract_lane_u $push15=, $1, 3
; CHECK-NEXT: i32.const $push374=, 1
-; CHECK-NEXT: i32.and $push243=, $pop242, $pop374
-; CHECK-NEXT: i32.const $push107=, 6
-; CHECK-NEXT: i32.shl $push244=, $pop243, $pop107
-; CHECK-NEXT: i32.or $push245=, $pop241, $pop244
-; CHECK-NEXT: i8x16.extract_lane_u $push246=, $2, 7
+; CHECK-NEXT: i32.and $push16=, $pop15, $pop374
+; CHECK-NEXT: i32.const $push17=, 19
+; CHECK-NEXT: i32.shl $push18=, $pop16, $pop17
+; CHECK-NEXT: i8x16.extract_lane_u $push10=, $1, 2
; CHECK-NEXT: i32.const $push373=, 1
-; CHECK-NEXT: i32.and $push247=, $pop246, $pop373
-; CHECK-NEXT: i32.const $push112=, 7
-; CHECK-NEXT: i32.shl $push248=, $pop247, $pop112
-; CHECK-NEXT: i32.or $push249=, $pop245, $pop248
-; CHECK-NEXT: i8x16.extract_lane_u $push250=, $2, 8
+; CHECK-NEXT: i32.and $push11=, $pop10, $pop373
+; CHECK-NEXT: i32.const $push12=, 18
+; CHECK-NEXT: i32.shl $push13=, $pop11, $pop12
+; CHECK-NEXT: i8x16.extract_lane_u $push5=, $1, 1
; CHECK-NEXT: i32.const $push372=, 1
-; CHECK-NEXT: i32.and $push251=, $pop250, $pop372
-; CHECK-NEXT: i32.const $push117=, 8
-; CHECK-NEXT: i32.shl $push252=, $pop251, $pop117
-; CHECK-NEXT: i32.or $push253=, $pop249, $pop252
-; CHECK-NEXT: i8x16.extract_lane_u $push254=, $2, 9
+; CHECK-NEXT: i32.and $push6=, $pop5, $pop372
+; CHECK-NEXT: i32.const $push7=, 17
+; CHECK-NEXT: i32.shl $push8=, $pop6, $pop7
+; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0
; CHECK-NEXT: i32.const $push371=, 1
-; CHECK-NEXT: i32.and $push255=, $pop254, $pop371
-; CHECK-NEXT: i32.const $push122=, 9
-; CHECK-NEXT: i32.shl $push256=, $pop255, $pop122
-; CHECK-NEXT: i32.or $push257=, $pop253, $pop256
-; CHECK-NEXT: i8x16.extract_lane_u $push258=, $2, 10
+; CHECK-NEXT: i32.and $push2=, $pop0, $pop371
+; CHECK-NEXT: i32.const $push3=, 16
+; CHECK-NEXT: i32.shl $push4=, $pop2, $pop3
+; CHECK-NEXT: i32.or $push9=, $pop8, $pop4
+; CHECK-NEXT: i32.or $push14=, $pop13, $pop9
+; CHECK-NEXT: i32.or $push19=, $pop18, $pop14
+; CHECK-NEXT: i32.or $push24=, $pop23, $pop19
+; CHECK-NEXT: i32.or $push29=, $pop28, $pop24
+; CHECK-NEXT: i32.or $push34=, $pop33, $pop29
+; CHECK-NEXT: i32.or $push39=, $pop38, $pop34
+; CHECK-NEXT: i32.or $4=, $pop43, $pop39
+; CHECK-NEXT: i8x16.extract_lane_u $push184=, $3, 7
; CHECK-NEXT: i32.const $push370=, 1
-; CHECK-NEXT: i32.and $push259=, $pop258, $pop370
-; CHECK-NEXT: i32.const $push127=, 10
-; CHECK-NEXT: i32.shl $push260=, $pop259, $pop127
-; CHECK-NEXT: i32.or $push261=, $pop257, $pop260
-; CHECK-NEXT: i8x16.extract_lane_u $push262=, $2, 11
-; CHECK-NEXT: i32.const $push369=, 1
-; CHECK-NEXT: i32.and $push263=, $pop262, $pop369
-; CHECK-NEXT: i32.const $push132=, 11
-; CHECK-NEXT: i32.shl $push264=, $pop263, $pop132
-; CHECK-NEXT: i32.or $push265=, $pop261, $pop264
-; CHECK-NEXT: i8x16.extract_lane_u $push266=, $2, 12
+; CHECK-NEXT: i32.and $push185=, $pop184, $pop370
+; CHECK-NEXT: i32.const $push369=, 23
+; CHECK-NEXT: i32.shl $push186=, $pop185, $pop369
+; CHECK-NEXT: i8x16.extract_lane_u $push180=, $3, 6
; CHECK-NEXT: i32.const $push368=, 1
-; CHECK-NEXT: i32.and $push267=, $pop266, $pop368
-; CHECK-NEXT: i32.const $push137=, 12
-; CHECK-NEXT: i32.shl $push268=, $pop267, $pop137
-; CHECK-NEXT: i32.or $push269=, $pop265, $pop268
-; CHECK-NEXT: i8x16.extract_lane_u $push270=, $2, 13
-; CHECK-NEXT: i32.const $push367=, 1
-; CHECK-NEXT: i32.and $push271=, $pop270, $pop367
-; CHECK-NEXT: i32.const $push142=, 13
-; CHECK-NEXT: i32.shl $push272=, $pop271, $pop142
-; CHECK-NEXT: i32.or $push273=, $pop269, $pop272
-; CHECK-NEXT: i8x16.extract_lane_u $push274=, $2, 14
+; CHECK-NEXT: i32.and $push181=, $pop180, $pop368
+; CHECK-NEXT: i32.const $push367=, 22
+; CHECK-NEXT: i32.shl $push182=, $pop181, $pop367
+; CHECK-NEXT: i8x16.extract_lane_u $push176=, $3, 5
; CHECK-NEXT: i32.const $push366=, 1
-; CHECK-NEXT: i32.and $push275=, $pop274, $pop366
-; CHECK-NEXT: i32.const $push147=, 14
-; CHECK-NEXT: i32.shl $push276=, $pop275, $pop147
-; CHECK-NEXT: i32.or $push277=, $pop273, $pop276
-; CHECK-NEXT: i8x16.extract_lane_u $push278=, $2, 15
-; CHECK-NEXT: i32.const $push151=, 15
-; CHECK-NEXT: i32.shl $push279=, $pop278, $pop151
-; CHECK-NEXT: i32.or $push280=, $pop277, $pop279
-; CHECK-NEXT: i32.const $push154=, 65535
-; CHECK-NEXT: i32.and $push281=, $pop280, $pop154
-; CHECK-NEXT: i8x16.extract_lane_u $push217=, $3, 15
-; CHECK-NEXT: i32.const $push76=, 31
-; CHECK-NEXT: i32.shl $push218=, $pop217, $pop76
-; CHECK-NEXT: i8x16.extract_lane_u $push213=, $3, 14
-; CHECK-NEXT: i32.const $push365=, 1
-; CHECK-NEXT: i32.and $push214=, $pop213, $pop365
-; CHECK-NEXT: i32.const $push72=, 30
-; CHECK-NEXT: i32.shl $push215=, $pop214, $pop72
-; CHECK-NEXT: i8x16.extract_lane_u $push209=, $3, 13
+; CHECK-NEXT: i32.and $push177=, $pop176, $pop366
+; CHECK-NEXT: i32.const $push365=, 21
+; CHECK-NEXT: i32.shl $push178=, $pop177, $pop365
+; CHECK-NEXT: i8x16.extract_lane_u $push172=, $3, 4
; CHECK-NEXT: i32.const $push364=, 1
-; CHECK-NEXT: i32.and $push210=, $pop209, $pop364
-; CHECK-NEXT: i32.const $push67=, 29
-; CHECK-NEXT: i32.shl $push211=, $pop210, $pop67
-; CHECK-NEXT: i8x16.extract_lane_u $push205=, $3, 12
-; CHECK-NEXT: i32.const $push363=, 1
-; CHECK-NEXT: i32.and $push206=, $pop205, $pop363
-; CHECK-NEXT: i32.const $push62=, 28
-; CHECK-NEXT: i32.shl $push207=, $pop206, $pop62
-; CHECK-NEXT: i8x16.extract_lane_u $push201=, $3, 11
+; CHECK-NEXT: i32.and $push173=, $pop172, $pop364
+; CHECK-NEXT: i32.const $push363=, 20
+; CHECK-NEXT: i32.shl $push174=, $pop173, $pop363
+; CHECK-NEXT: i8x16.extract_lane_u $push168=, $3, 3
; CHECK-NEXT: i32.const $push362=, 1
-; CHECK-NEXT: i32.and $push202=, $pop201, $pop362
-; CHECK-NEXT: i32.const $push57=, 27
-; CHECK-NEXT: i32.shl $push203=, $pop202, $pop57
-; CHECK-NEXT: i8x16.extract_lane_u $push197=, $3, 10
-; CHECK-NEXT: i32.const $push361=, 1
-; CHECK-NEXT: i32.and $push198=, $pop197, $pop361
-; CHECK-NEXT: i32.const $push52=, 26
-; CHECK-NEXT: i32.shl $push199=, $pop198, $pop52
-; CHECK-NEXT: i8x16.extract_lane_u $push193=, $3, 9
+; CHECK-NEXT: i32.and $push169=, $pop168, $pop362
+; CHECK-NEXT: i32.const $push361=, 19
+; CHECK-NEXT: i32.shl $push170=, $pop169, $pop361
+; CHECK-NEXT: i8x16.extract_lane_u $push164=, $3, 2
; CHECK-NEXT: i32.const $push360=, 1
-; CHECK-NEXT: i32.and $push194=, $pop193, $pop360
-; CHECK-NEXT: i32.const $push47=, 25
-; CHECK-NEXT: i32.shl $push195=, $pop194, $pop47
-; CHECK-NEXT: i8x16.extract_lane_u $push189=, $3, 8
-; CHECK-NEXT: i32.const $push359=, 1
-; CHECK-NEXT: i32.and $push190=, $pop189, $pop359
-; CHECK-NEXT: i32.const $push42=, 24
-; CHECK-NEXT: i32.shl $push191=, $pop190, $pop42
-; CHECK-NEXT: i8x16.extract_lane_u $push185=, $3, 7
+; CHECK-NEXT: i32.and $push165=, $pop164, $pop360
+; CHECK-NEXT: i32.const $push359=, 18
+; CHECK-NEXT: i32.shl $push166=, $pop165, $pop359
+; CHECK-NEXT: i8x16.extract_lane_u $push160=, $3, 1
; CHECK-NEXT: i32.const $push358=, 1
-; CHECK-NEXT: i32.and $push186=, $pop185, $pop358
-; CHECK-NEXT: i32.const $push37=, 23
-; CHECK-NEXT: i32.shl $push187=, $pop186, $pop37
-; CHECK-NEXT: i8x16.extract_lane_u $push181=, $3, 6
-; CHECK-NEXT: i32.const $push357=, 1
-; CHECK-NEXT: i32.and $push182=, $pop181, $pop357
-; CHECK-NEXT: i32.const $push32=, 22
-; CHECK-NEXT: i32.shl $push183=, $pop182, $pop32
-; CHECK-NEXT: i8x16.extract_lane_u $push177=, $3, 5
+; CHECK-NEXT: i32.and $push161=, $pop160, $pop358
+; CHECK-NEXT: i32.const $push357=, 17
+; CHECK-NEXT: i32.shl $push162=, $pop161, $pop357
+; CHECK-NEXT: i8x16.extract_lane_u $push157=, $3, 0
; CHECK-NEXT: i32.const $push356=, 1
-; CHECK-NEXT: i32.and $push178=, $pop177, $pop356
-; CHECK-NEXT: i32.const $push27=, 21
-; CHECK-NEXT: i32.shl $push179=, $pop178, $pop27
-; CHECK-NEXT: i8x16.extract_lane_u $push173=, $3, 4
-; CHECK-NEXT: i32.const $push355=, 1
-; CHECK-NEXT: i32.and $push174=, $pop173, $pop355
-; CHECK-NEXT: i32.const $push22=, 20
-; CHECK-NEXT: i32.shl $push175=, $pop174, $pop22
-; CHECK-NEXT: i8x16.extract_lane_u $push169=, $3, 3
+; CHECK-NEXT: i32.and $push158=, $pop157, $pop356
+; CHECK-NEXT: i32.const $push355=, 16
+; CHECK-NEXT: i32.shl $push159=, $pop158, $pop355
+; CHECK-NEXT: i32.or $push163=, $pop162, $pop159
+; CHECK-NEXT: i32.or $push167=, $pop166, $pop163
+; CHECK-NEXT: i32.or $push171=, $pop170, $pop167
+; CHECK-NEXT: i32.or $push175=, $pop174, $pop171
+; CHECK-NEXT: i32.or $push179=, $pop178, $pop175
+; CHECK-NEXT: i32.or $push183=, $pop182, $pop179
+; CHECK-NEXT: i32.or $5=, $pop186, $pop183
+; CHECK-NEXT: i8x16.extract_lane_u $push218=, $2, 0
; CHECK-NEXT: i32.const $push354=, 1
-; CHECK-NEXT: i32.and $push170=, $pop169, $pop354
-; CHECK-NEXT: i32.const $push17=, 19
-; CHECK-NEXT: i32.shl $push171=, $pop170, $pop17
-; CHECK-NEXT: i8x16.extract_lane_u $push165=, $3, 2
+; CHECK-NEXT: i32.and $push219=, $pop218, $pop354
+; CHECK-NEXT: i8x16.extract_lane_u $push220=, $2, 1
; CHECK-NEXT: i32.const $push353=, 1
-; CHECK-NEXT: i32.and $push166=, $pop165, $pop353
-; CHECK-NEXT: i32.const $push12=, 18
-; CHECK-NEXT: i32.shl $push167=, $pop166, $pop12
-; CHECK-NEXT: i8x16.extract_lane_u $push161=, $3, 1
+; CHECK-NEXT: i32.and $push221=, $pop220, $pop353
; CHECK-NEXT: i32.const $push352=, 1
-; CHECK-NEXT: i32.and $push162=, $pop161, $pop352
-; CHECK-NEXT: i32.const $push7=, 17
-; CHECK-NEXT: i32.shl $push163=, $pop162, $pop7
-; CHECK-NEXT: i8x16.extract_lane_u $push158=, $3, 0
+; CHECK-NEXT: i32.shl $push222=, $pop221, $pop352
+; CHECK-NEXT: i32.or $push223=, $pop219, $pop222
+; CHECK-NEXT: i8x16.extract_lane_u $push224=, $2, 2
; CHECK-NEXT: i32.const $push351=, 1
-; CHECK-NEXT: i32.and $push159=, $pop158, $pop351
-; CHECK-NEXT: i32.const $push3=, 16
-; CHECK-NEXT: i32.shl $push160=, $pop159, $pop3
-; CHECK-NEXT: i32.or $push164=, $pop163, $pop160
-; CHECK-NEXT: i32.or $push168=, $pop167, $pop164
-; CHECK-NEXT: i32.or $push172=, $pop171, $pop168
-; CHECK-NEXT: i32.or $push176=, $pop175, $pop172
-; CHECK-NEXT: i32.or $push180=, $pop179, $pop176
-; CHECK-NEXT: i32.or $push184=, $pop183, $pop180
-; CHECK-NEXT: i32.or $push188=, $pop187, $pop184
-; CHECK-NEXT: i32.or $push192=, $pop191, $pop188
-; CHECK-NEXT: i32.or $push196=, $pop195, $pop192
-; CHECK-NEXT: i32.or $push200=, $pop199, $pop196
-; CHECK-NEXT: i32.or $push204=, $pop203, $pop200
-; CHECK-NEXT: i32.or $push208=, $pop207, $pop204
-; CHECK-NEXT: i32.or $push212=, $pop211, $pop208
-; CHECK-NEXT: i32.or $push216=, $pop215, $pop212
-; CHECK-NEXT: i32.or $push219=, $pop218, $pop216
-; CHECK-NEXT: i32.or $push282=, $pop281, $pop219
-; CHECK-NEXT: i64.extend_i32_u $push283=, $pop282
-; CHECK-NEXT: i8x16.extract_lane_u $push79=, $0, 0
+; CHECK-NEXT: i32.and $push225=, $pop224, $pop351
+; CHECK-NEXT: i32.const $push86=, 2
+; CHECK-NEXT: i32.shl $push226=, $pop225, $pop86
+; CHECK-NEXT: i32.or $push227=, $pop223, $pop226
+; CHECK-NEXT: i8x16.extract_lane_u $push228=, $2, 3
; CHECK-NEXT: i32.const $push350=, 1
-; CHECK-NEXT: i32.and $push80=, $pop79, $pop350
-; CHECK-NEXT: i8x16.extract_lane_u $push81=, $0, 1
+; CHECK-NEXT: i32.and $push229=, $pop228, $pop350
+; CHECK-NEXT: i32.const $push91=, 3
+; CHECK-NEXT: i32.shl $push230=, $pop229, $pop91
+; CHECK-NEXT: i32.or $push231=, $pop227, $pop230
+; CHECK-NEXT: i8x16.extract_lane_u $push232=, $2, 4
; CHECK-NEXT: i32.const $push349=, 1
-; CHECK-NEXT: i32.and $push82=, $pop81, $pop349
+; CHECK-NEXT: i32.and $push233=, $pop232, $pop349
+; CHECK-NEXT: i32.const $push96=, 4
+; CHECK-NEXT: i32.shl $push234=, $pop233, $pop96
+; CHECK-NEXT: i32.or $push235=, $pop231, $pop234
+; CHECK-NEXT: i8x16.extract_lane_u $push236=, $2, 5
; CHECK-NEXT: i32.const $push348=, 1
-; CHECK-NEXT: i32.shl $push83=, $pop82, $pop348
-; CHECK-NEXT: i32.or $push84=, $pop80, $pop83
-; CHECK-NEXT: i8x16.extract_lane_u $push85=, $0, 2
+; CHECK-NEXT: i32.and $push237=, $pop236, $pop348
+; CHECK-NEXT: i32.const $push101=, 5
+; CHECK-NEXT: i32.shl $push238=, $pop237, $pop101
+; CHECK-NEXT: i32.or $push239=, $pop235, $pop238
+; CHECK-NEXT: i8x16.extract_lane_u $push240=, $2, 6
; CHECK-NEXT: i32.const $push347=, 1
-; CHECK-NEXT: i32.and $push86=, $pop85, $pop347
-; CHECK-NEXT: i32.const $push346=, 2
-; CHECK-NEXT: i32.shl $push88=, $pop86, $pop346
-; CHECK-NEXT: i32.or $push89=, $pop84, $pop88
-; CHECK-NEXT: i8x16.extract_lane_u $push90=, $0, 3
+; CHECK-NEXT: i32.and $push241=, $pop240, $pop347
+; CHECK-NEXT: i32.const $push106=, 6
+; CHECK-NEXT: i32.shl $push242=, $pop241, $pop106
+; CHECK-NEXT: i32.or $push243=, $pop239, $pop242
+; CHECK-NEXT: i8x16.extract_lane_u $push244=, $2, 7
+; CHECK-NEXT: i32.const $push346=, 1
+; CHECK-NEXT: i32.and $push245=, $pop244, $pop346
+; CHECK-NEXT: i32.const $push111=, 7
+; CHECK-NEXT: i32.shl $push246=, $pop245, $pop111
+; CHECK-NEXT: i32.or $push247=, $pop243, $pop246
+; CHECK-NEXT: i8x16.extract_lane_u $push248=, $2, 8
; CHECK-NEXT: i32.const $push345=, 1
-; CHECK-NEXT: i32.and $push91=, $pop90, $pop345
-; CHECK-NEXT: i32.const $push344=, 3
-; CHECK-NEXT: i32.shl $push93=, $pop91, $pop344
-; CHECK-NEXT: i32.or $push94=, $pop89, $pop93
-; CHECK-NEXT: i8x16.extract_lane_u $push95=, $0, 4
+; CHECK-NEXT: i32.and $push249=, $pop248, $pop345
+; CHECK-NEXT: i32.const $push116=, 8
+; CHECK-NEXT: i32.shl $push250=, $pop249, $pop116
+; CHECK-NEXT: i32.or $push251=, $pop247, $pop250
+; CHECK-NEXT: i8x16.extract_lane_u $push252=, $2, 9
+; CHECK-NEXT: i32.const $push344=, 1
+; CHECK-NEXT: i32.and $push253=, $pop252, $pop344
+; CHECK-NEXT: i32.const $push121=, 9
+; CHECK-NEXT: i32.shl $push254=, $pop253, $pop121
+; CHECK-NEXT: i32.or $push255=, $pop251, $pop254
+; CHECK-NEXT: i8x16.extract_lane_u $push256=, $2, 10
; CHECK-NEXT: i32.const $push343=, 1
-; CHECK-NEXT: i32.and $push96=, $pop95, $pop343
-; CHECK-NEXT: i32.const $push342=, 4
-; CHECK-NEXT: i32.shl $push98=, $pop96, $pop342
-; CHECK-NEXT: i32.or $push99=, $pop94, $pop98
-; CHECK-NEXT: i8x16.extract_lane_u $push100=, $0, 5
+; CHECK-NEXT: i32.and $push257=, $pop256, $pop343
+; CHECK-NEXT: i32.const $push126=, 10
+; CHECK-NEXT: i32.shl $push258=, $pop257, $pop126
+; CHECK-NEXT: i32.or $push259=, $pop255, $pop258
+; CHECK-NEXT: i8x16.extract_lane_u $push260=, $2, 11
+; CHECK-NEXT: i32.const $push342=, 1
+; CHECK-NEXT: i32.and $push261=, $pop260, $pop342
+; CHECK-NEXT: i32.const $push131=, 11
+; CHECK-NEXT: i32.shl $push262=, $pop261, $pop131
+; CHECK-NEXT: i32.or $push263=, $pop259, $pop262
+; CHECK-NEXT: i8x16.extract_lane_u $push264=, $2, 12
; CHECK-NEXT: i32.const $push341=, 1
-; CHECK-NEXT: i32.and $push101=, $pop100, $pop341
-; CHECK-NEXT: i32.const $push340=, 5
-; CHECK-NEXT: i32.shl $push103=, $pop101, $pop340
-; CHECK-NEXT: i32.or $push104=, $pop99, $pop103
-; CHECK-NEXT: i8x16.extract_lane_u $push105=, $0, 6
+; CHECK-NEXT: i32.and $push265=, $pop264, $pop341
+; CHECK-NEXT: i32.const $push136=, 12
+; CHECK-NEXT: i32.shl $push266=, $pop265, $pop136
+; CHECK-NEXT: i32.or $push267=, $pop263, $pop266
+; CHECK-NEXT: i8x16.extract_lane_u $push268=, $2, 13
+; CHECK-NEXT: i32.const $push340=, 1
+; CHECK-NEXT: i32.and $push269=, $pop268, $pop340
+; CHECK-NEXT: i32.const $push141=, 13
+; CHECK-NEXT: i32.shl $push270=, $pop269, $pop141
+; CHECK-NEXT: i32.or $push271=, $pop267, $pop270
+; CHECK-NEXT: i8x16.extract_lane_u $push272=, $2, 14
; CHECK-NEXT: i32.const $push339=, 1
-; CHECK-NEXT: i32.and $push106=, $pop105, $pop339
-; CHECK-NEXT: i32.const $push338=, 6
-; CHECK-NEXT: i32.shl $push108=, $pop106, $pop338
-; CHECK-NEXT: i32.or $push109=, $pop104, $pop108
-; CHECK-NEXT: i8x16.extract_lane_u $push110=, $0, 7
+; CHECK-NEXT: i32.and $push273=, $pop272, $pop339
+; CHECK-NEXT: i32.const $push146=, 14
+; CHECK-NEXT: i32.shl $push274=, $pop273, $pop146
+; CHECK-NEXT: i32.or $push275=, $pop271, $pop274
+; CHECK-NEXT: i8x16.extract_lane_u $push276=, $2, 15
+; CHECK-NEXT: i32.const $push150=, 15
+; CHECK-NEXT: i32.shl $push277=, $pop276, $pop150
+; CHECK-NEXT: i32.or $push278=, $pop275, $pop277
+; CHECK-NEXT: i32.const $push153=, 65535
+; CHECK-NEXT: i32.and $push279=, $pop278, $pop153
+; CHECK-NEXT: i8x16.extract_lane_u $push215=, $3, 15
+; CHECK-NEXT: i32.const $push75=, 31
+; CHECK-NEXT: i32.shl $push216=, $pop215, $pop75
+; CHECK-NEXT: i8x16.extract_lane_u $push211=, $3, 14
+; CHECK-NEXT: i32.const $push338=, 1
+; CHECK-NEXT: i32.and $push212=, $pop211, $pop338
+; CHECK-NEXT: i32.const $push71=, 30
+; CHECK-NEXT: i32.shl $push213=, $pop212, $pop71
+; CHECK-NEXT: i8x16.extract_lane_u $push207=, $3, 13
; CHECK-NEXT: i32.const $push337=, 1
-; CHECK-NEXT: i32.and $push111=, $pop110, $pop337
-; CHECK-NEXT: i32.const $push336=, 7
-; CHECK-NEXT: i32.shl $push113=, $pop111, $pop336
-; CHECK-NEXT: i32.or $push114=, $pop109, $pop113
-; CHECK-NEXT: i8x16.extract_lane_u $push115=, $0, 8
+; CHECK-NEXT: i32.and $push208=, $pop207, $pop337
+; CHECK-NEXT: i32.const $push66=, 29
+; CHECK-NEXT: i32.shl $push209=, $pop208, $pop66
+; CHECK-NEXT: i8x16.extract_lane_u $push203=, $3, 12
+; CHECK-NEXT: i32.const $push336=, 1
+; CHECK-NEXT: i32.and $push204=, $pop203, $pop336
+; CHECK-NEXT: i32.const $push61=, 28
+; CHECK-NEXT: i32.shl $push205=, $pop204, $pop61
+; CHECK-NEXT: i8x16.extract_lane_u $push199=, $3, 11
; CHECK-NEXT: i32.const $push335=, 1
-; CHECK-NEXT: i32.and $push116=, $pop115, $pop335
-; CHECK-NEXT: i32.const $push334=, 8
-; CHECK-NEXT: i32.shl $push118=, $pop116, $pop334
-; CHECK-NEXT: i32.or $push119=, $pop114, $pop118
-; CHECK-NEXT: i8x16.extract_lane_u $push120=, $0, 9
+; CHECK-NEXT: i32.and $push200=, $pop199, $pop335
+; CHECK-NEXT: i32.const $push56=, 27
+; CHECK-NEXT: i32.shl $push201=, $pop200, $pop56
+; CHECK-NEXT: i8x16.extract_lane_u $push195=, $3, 10
+; CHECK-NEXT: i32.const $push334=, 1
+; CHECK-NEXT: i32.and $push196=, $pop195, $pop334
+; CHECK-NEXT: i32.const $push51=, 26
+; CHECK-NEXT: i32.shl $push197=, $pop196, $pop51
+; CHECK-NEXT: i8x16.extract_lane_u $push191=, $3, 9
; CHECK-NEXT: i32.const $push333=, 1
-; CHECK-NEXT: i32.and $push121=, $pop120, $pop333
-; CHECK-NEXT: i32.const $push332=, 9
-; CHECK-NEXT: i32.shl $push123=, $pop121, $pop332
-; CHECK-NEXT: i32.or $push124=, $pop119, $pop123
-; CHECK-NEXT: i8x16.extract_lane_u $push125=, $0, 10
-; CHECK-NEXT: i32.const $push331=, 1
-; CHECK-NEXT: i32.and $push126=, $pop125, $pop331
-; CHECK-NEXT: i32.const $push330=, 10
-; CHECK-NEXT: i32.shl $push128=, $pop126, $pop330
-; CHECK-NEXT: i32.or $push129=, $pop124, $pop128
-; CHECK-NEXT: i8x16.extract_lane_u $push130=, $0, 11
+; CHECK-NEXT: i32.and $push192=, $pop191, $pop333
+; CHECK-NEXT: i32.const $push46=, 25
+; CHECK-NEXT: i32.shl $push193=, $pop192, $pop46
+; CHECK-NEXT: i8x16.extract_lane_u $push187=, $3, 8
+; CHECK-NEXT: i32.const $push332=, 1
+; CHECK-NEXT: i32.and $push188=, $pop187, $pop332
+; CHECK-NEXT: i32.const $push331=, 24
+; CHECK-NEXT: i32.shl $push189=, $pop188, $pop331
+; CHECK-NEXT: i32.or $push190=, $pop189, $5
+; CHECK-NEXT: i32.or $push194=, $pop193, $pop190
+; CHECK-NEXT: i32.or $push198=, $pop197, $pop194
+; CHECK-NEXT: i32.or $push202=, $pop201, $pop198
+; CHECK-NEXT: i32.or $push206=, $pop205, $pop202
+; CHECK-NEXT: i32.or $push210=, $pop209, $pop206
+; CHECK-NEXT: i32.or $push214=, $pop213, $pop210
+; CHECK-NEXT: i32.or $push217=, $pop216, $pop214
+; CHECK-NEXT: i32.or $push280=, $pop279, $pop217
+; CHECK-NEXT: i64.extend_i32_u $push281=, $pop280
+; CHECK-NEXT: i8x16.extract_lane_u $push78=, $0, 0
+; CHECK-NEXT: i32.const $push330=, 1
+; CHECK-NEXT: i32.and $push79=, $pop78, $pop330
+; CHECK-NEXT: i8x16.extract_lane_u $push80=, $0, 1
; CHECK-NEXT: i32.const $push329=, 1
-; CHECK-NEXT: i32.and $push131=, $pop130, $pop329
-; CHECK-NEXT: i32.const $push328=, 11
-; CHECK-NEXT: i32.shl $push133=, $pop131, $pop328
-; CHECK-NEXT: i32.or $push134=, $pop129, $pop133
-; CHECK-NEXT: i8x16.extract_lane_u $push135=, $0, 12
+; CHECK-NEXT: i32.and $push81=, $pop80, $pop329
+; CHECK-NEXT: i32.const $push328=, 1
+; CHECK-NEXT: i32.shl $push82=, $pop81, $pop328
+; CHECK-NEXT: i32.or $push83=, $pop79, $pop82
+; CHECK-NEXT: i8x16.extract_lane_u $push84=, $0, 2
; CHECK-NEXT: i32.const $push327=, 1
-; CHECK-NEXT: i32.and $push136=, $pop135, $pop327
-; CHECK-NEXT: i32.const $push326=, 12
-; CHECK-NEXT: i32.shl $push138=, $pop136, $pop326
-; CHECK-NEXT: i32.or $push139=, $pop134, $pop138
-; CHECK-NEXT: i8x16.extract_lane_u $push140=, $0, 13
+; CHECK-NEXT: i32.and $push85=, $pop84, $pop327
+; CHECK-NEXT: i32.const $push326=, 2
+; CHECK-NEXT: i32.shl $push87=, $pop85, $pop326
+; CHECK-NEXT: i32.or $push88=, $pop83, $pop87
+; CHECK-NEXT: i8x16.extract_lane_u $push89=, $0, 3
; CHECK-NEXT: i32.const $push325=, 1
-; CHECK-NEXT: i32.and $push141=, $pop140, $pop325
-; CHECK-NEXT: i32.const $push324=, 13
-; CHECK-NEXT: i32.shl $push143=, $pop141, $pop324
-; CHECK-NEXT: i32.or $push144=, $pop139, $pop143
-; CHECK-NEXT: i8x16.extract_lane_u $push145=, $0, 14
+; CHECK-NEXT: i32.and $push90=, $pop89, $pop325
+; CHECK-NEXT: i32.const $push324=, 3
+; CHECK-NEXT: i32.shl $push92=, $pop90, $pop324
+; CHECK-NEXT: i32.or $push93=, $pop88, $pop92
+; CHECK-NEXT: i8x16.extract_lane_u $push94=, $0, 4
; CHECK-NEXT: i32.const $push323=, 1
-; CHECK-NEXT: i32.and $push146=, $pop145, $pop323
-; CHECK-NEXT: i32.const $push322=, 14
-; CHECK-NEXT: i32.shl $push148=, $pop146, $pop322
-; CHECK-NEXT: i32.or $push149=, $pop144, $pop148
-; CHECK-NEXT: i8x16.extract_lane_u $push150=, $0, 15
-; CHECK-NEXT: i32.const $push321=, 15
-; CHECK-NEXT: i32.shl $push152=, $pop150, $pop321
-; CHECK-NEXT: i32.or $push153=, $pop149, $pop152
-; CHECK-NEXT: i32.const $push320=, 65535
-; CHECK-NEXT: i32.and $push155=, $pop153, $pop320
-; CHECK-NEXT: i8x16.extract_lane_u $push75=, $1, 15
-; CHECK-NEXT: i32.const $push319=, 31
-; CHECK-NEXT: i32.shl $push77=, $pop75, $pop319
-; CHECK-NEXT: i8x16.extract_lane_u $push70=, $1, 14
-; CHECK-NEXT: i32.const $push318=, 1
-; CHECK-NEXT: i32.and $push71=, $pop70, $pop318
-; CHECK-NEXT: i32.const $push317=, 30
-; CHECK-NEXT: i32.shl $push73=, $pop71, $pop317
-; CHECK-NEXT: i8x16.extract_lane_u $push65=, $1, 13
-; CHECK-NEXT: i32.const $push316=, 1
-; CHECK-NEXT: i32.and $push66=, $pop65, $pop316
-; CHECK-NEXT: i32.const $push315=, 29
-; CHECK-NEXT: i32.shl $push68=, $pop66, $pop315
-; CHECK-NEXT: i8x16.extract_lane_u $push60=, $1, 12
-; CHECK-NEXT: i32.const $push314=, 1
-; CHECK-NEXT: i32.and $push61=, $pop60, $pop314
-; CHECK-NEXT: i32.const $push313=, 28
-; CHECK-NEXT: i32.shl $push63=, $pop61, $pop313
-; CHECK-NEXT: i8x16.extract_lane_u $push55=, $1, 11
-; CHECK-NEXT: i32.const $push312=, 1
-; CHECK-NEXT: i32.and $push56=, $pop55, $pop312
-; CHECK-NEXT: i32.const $push311=, 27
-; CHECK-NEXT: i32.shl $push58=, $pop56, $pop311
-; CHECK-NEXT: i8x16.extract_lane_u $push50=, $1, 10
-; CHECK-NEXT: i32.const $push310=, 1
-; CHECK-NEXT: i32.and $push51=, $pop50, $pop310
-; CHECK-NEXT: i32.const $push309=, 26
-; CHECK-NEXT: i32.shl $push53=, $pop51, $pop309
-; CHECK-NEXT: i8x16.extract_lane_u $push45=, $1, 9
-; CHECK-NEXT: i32.const $push308=, 1
-; CHECK-NEXT: i32.and $push46=, $pop45, $pop308
-; CHECK-NEXT: i32.const $push307=, 25
-; CHECK-NEXT: i32.shl $push48=, $pop46, $pop307
-; CHECK-NEXT: i8x16.extract_lane_u $push40=, $1, 8
-; CHECK-NEXT: i32.const $push306=, 1
-; CHECK-NEXT: i32.and $push41=, $pop40, $pop306
-; CHECK-NEXT: i32.const $push305=, 24
-; CHECK-NEXT: i32.shl $push43=, $pop41, $pop305
-; CHECK-NEXT: i8x16.extract_lane_u $push35=, $1, 7
-; CHECK-NEXT: i32.const $push304=, 1
-; CHECK-NEXT: i32.and $push36=, $pop35, $pop304
-; CHECK-NEXT: i32.const $push303=, 23
-; CHECK-NEXT: i32.shl $push38=, $pop36, $pop303
-; CHECK-NEXT: i8x16.extract_lane_u $push30=, $1, 6
-; CHECK-NEXT: i32.const $push302=, 1
-; CHECK-NEXT: i32.and $push31=, $pop30, $pop302
-; CHECK-NEXT: i32.const $push301=, 22
-; CHECK-NEXT: i32.shl $push33=, $pop31, $pop301
-; CHECK-NEXT: i8x16.extract_lane_u $push25=, $1, 5
-; CHECK-NEXT: i32.const $push300=, 1
-; CHECK-NEXT: i32.and $push26=, $pop25, $pop300
-; CHECK-NEXT: i32.const $push299=, 21
-; CHECK-NEXT: i32.shl $push28=, $pop26, $pop299
-; CHECK-NEXT: i8x16.extract_lane_u $push20=, $1, 4
+; CHECK-NEXT: i32.and $push95=, $pop94, $pop323
+; CHECK-NEXT: i32.const $push322=, 4
+; CHECK-NEXT: i32.shl $push97=, $pop95, $pop322
+; CHECK-NEXT: i32.or $push98=, $pop93, $pop97
+; CHECK-NEXT: i8x16.extract_lane_u $push99=, $0, 5
+; CHECK-NEXT: i32.const $push321=, 1
+; CHECK-NEXT: i32.and $push100=, $pop99, $pop321
+; CHECK-NEXT: i32.const $push320=, 5
+; CHECK-NEXT: i32.shl $push102=, $pop100, $pop320
+; CHECK-NEXT: i32.or $push103=, $pop98, $pop102
+; CHECK-NEXT: i8x16.extract_lane_u $push104=, $0, 6
+; CHECK-NEXT: i32.const $push319=, 1
+; CHECK-NEXT: i32.and $push105=, $pop104, $pop319
+; CHECK-NEXT: i32.const $push318=, 6
+; CHECK-NEXT: i32.shl $push107=, $pop105, $pop318
+; CHECK-NEXT: i32.or $push108=, $pop103, $pop107
+; CHECK-NEXT: i8x16.extract_lane_u $push109=, $0, 7
+; CHECK-NEXT: i32.const $push317=, 1
+; CHECK-NEXT: i32.and $push110=, $pop109, $pop317
+; CHECK-NEXT: i32.const $push316=, 7
+; CHECK-NEXT: i32.shl $push112=, $pop110, $pop316
+; CHECK-NEXT: i32.or $push113=, $pop108, $pop112
+; CHECK-NEXT: i8x16.extract_lane_u $push114=, $0, 8
+; CHECK-NEXT: i32.const $push315=, 1
+; CHECK-NEXT: i32.and $push115=, $pop114, $pop315
+; CHECK-NEXT: i32.const $push314=, 8
+; CHECK-NEXT: i32.shl $push117=, $pop115, $pop314
+; CHECK-NEXT: i32.or $push118=, $pop113, $pop117
+; CHECK-NEXT: i8x16.extract_lane_u $push119=, $0, 9
+; CHECK-NEXT: i32.const $push313=, 1
+; CHECK-NEXT: i32.and $push120=, $pop119, $pop313
+; CHECK-NEXT: i32.const $push312=, 9
+; CHECK-NEXT: i32.shl $push122=, $pop120, $pop312
+; CHECK-NEXT: i32.or $push123=, $pop118, $pop122
+; CHECK-NEXT: i8x16.extract_lane_u $push124=, $0, 10
+; CHECK-NEXT: i32.const $push311=, 1
+; CHECK-NEXT: i32.and $push125=, $pop124, $pop311
+; CHECK-NEXT: i32.const $push310=, 10
+; CHECK-NEXT: i32.shl $push127=, $pop125, $pop310
+; CHECK-NEXT: i32.or $push128=, $pop123, $pop127
+; CHECK-NEXT: i8x16.extract_lane_u $push129=, $0, 11
+; CHECK-NEXT: i32.const $push309=, 1
+; CHECK-NEXT: i32.and $push130=, $pop129, $pop309
+; CHECK-NEXT: i32.const $push308=, 11
+; CHECK-NEXT: i32.shl $push132=, $pop130, $pop308
+; CHECK-NEXT: i32.or $push133=, $pop128, $pop132
+; CHECK-NEXT: i8x16.extract_lane_u $push134=, $0, 12
+; CHECK-NEXT: i32.const $push307=, 1
+; CHECK-NEXT: i32.and $push135=, $pop134, $pop307
+; CHECK-NEXT: i32.const $push306=, 12
+; CHECK-NEXT: i32.shl $push137=, $pop135, $pop306
+; CHECK-NEXT: i32.or $push138=, $pop133, $pop137
+; CHECK-NEXT: i8x16.extract_lane_u $push139=, $0, 13
+; CHECK-NEXT: i32.const $push305=, 1
+; CHECK-NEXT: i32.and $push140=, $pop139, $pop305
+; CHECK-NEXT: i32.const $push304=, 13
+; CHECK-NEXT: i32.shl $push142=, $pop140, $pop304
+; CHECK-NEXT: i32.or $push143=, $pop138, $pop142
+; CHECK-NEXT: i8x16.extract_lane_u $push144=, $0, 14
+; CHECK-NEXT: i32.const $push303=, 1
+; CHECK-NEXT: i32.and $push145=, $pop144, $pop303
+; CHECK-NEXT: i32.const $push302=, 14
+; CHECK-NEXT: i32.shl $push147=, $pop145, $pop302
+; CHECK-NEXT: i32.or $push148=, $pop143, $pop147
+; CHECK-NEXT: i8x16.extract_lane_u $push149=, $0, 15
+; CHECK-NEXT: i32.const $push301=, 15
+; CHECK-NEXT: i32.shl $push151=, $pop149, $pop301
+; CHECK-NEXT: i32.or $push152=, $pop148, $pop151
+; CHECK-NEXT: i32.const $push300=, 65535
+; CHECK-NEXT: i32.and $push154=, $pop152, $pop300
+; CHECK-NEXT: i8x16.extract_lane_u $push74=, $1, 15
+; CHECK-NEXT: i32.const $push299=, 31
+; CHECK-NEXT: i32.shl $push76=, $pop74, $pop299
+; CHECK-NEXT: i8x16.extract_lane_u $push69=, $1, 14
; CHECK-NEXT: i32.const $push298=, 1
-; CHECK-NEXT: i32.and $push21=, $pop20, $pop298
-; CHECK-NEXT: i32.const $push297=, 20
-; CHECK-NEXT: i32.shl $push23=, $pop21, $pop297
-; CHECK-NEXT: i8x16.extract_lane_u $push15=, $1, 3
+; CHECK-NEXT: i32.and $push70=, $pop69, $pop298
+; CHECK-NEXT: i32.const $push297=, 30
+; CHECK-NEXT: i32.shl $push72=, $pop70, $pop297
+; CHECK-NEXT: i8x16.extract_lane_u $push64=, $1, 13
; CHECK-NEXT: i32.const $push296=, 1
-; CHECK-NEXT: i32.and $push16=, $pop15, $pop296
-; CHECK-NEXT: i32.const $push295=, 19
-; CHECK-NEXT: i32.shl $push18=, $pop16, $pop295
-; CHECK-NEXT: i8x16.extract_lane_u $push10=, $1, 2
+; CHECK-NEXT: i32.and $push65=, $pop64, $pop296
+; CHECK-NEXT: i32.const $push295=, 29
+; CHECK-NEXT: i32.shl $push67=, $pop65, $pop295
+; CHECK-NEXT: i8x16.extract_lane_u $push59=, $1, 12
; CHECK-NEXT: i32.const $push294=, 1
-; CHECK-NEXT: i32.and $push11=, $pop10, $pop294
-; CHECK-NEXT: i32.const $push293=, 18
-; CHECK-NEXT: i32.shl $push13=, $pop11, $pop293
-; CHECK-NEXT: i8x16.extract_lane_u $push5=, $1, 1
+; CHECK-NEXT: i32.and $push60=, $pop59, $pop294
+; CHECK-NEXT: i32.const $push293=, 28
+; CHECK-NEXT: i32.shl $push62=, $pop60, $pop293
+; CHECK-NEXT: i8x16.extract_lane_u $push54=, $1, 11
; CHECK-NEXT: i32.const $push292=, 1
-; CHECK-NEXT: i32.and $push6=, $pop5, $pop292
-; CHECK-NEXT: i32.const $push291=, 17
-; CHECK-NEXT: i32.shl $push8=, $pop6, $pop291
-; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0
+; CHECK-NEXT: i32.and $push55=, $pop54, $pop292
+; CHECK-NEXT: i32.const $push291=, 27
+; CHECK-NEXT: i32.shl $push57=, $pop55, $pop291
+; CHECK-NEXT: i8x16.extract_lane_u $push49=, $1, 10
; CHECK-NEXT: i32.const $push290=, 1
-; CHECK-NEXT: i32.and $push2=, $pop0, $pop290
-; CHECK-NEXT: i32.const $push289=, 16
-; CHECK-NEXT: i32.shl $push4=, $pop2, $pop289
-; CHECK-NEXT: i32.or $push9=, $pop8, $pop4
-; CHECK-NEXT: i32.or $push14=, $pop13, $pop9
-; CHECK-NEXT: i32.or $push19=, $pop18, $pop14
-; CHECK-NEXT: i32.or $push24=, $pop23, $pop19
-; CHECK-NEXT: i32.or $push29=, $pop28, $pop24
-; CHECK-NEXT: i32.or $push34=, $pop33, $pop29
-; CHECK-NEXT: i32.or $push39=, $pop38, $pop34
-; CHECK-NEXT: i32.or $push44=, $pop43, $pop39
-; CHECK-NEXT: i32.or $push49=, $pop48, $pop44
-; CHECK-NEXT: i32.or $push54=, $pop53, $pop49
-; CHECK-NEXT: i32.or $push59=, $pop58, $pop54
-; CHECK-NEXT: i32.or $push64=, $pop63, $pop59
-; CHECK-NEXT: i32.or $push69=, $pop68, $pop64
-; CHECK-NEXT: i32.or $push74=, $pop73, $pop69
-; CHECK-NEXT: i32.or $push78=, $pop77, $pop74
-; CHECK-NEXT: i32.or $push156=, $pop155, $pop78
-; CHECK-NEXT: i64.extend_i32_u $push157=, $pop156
-; CHECK-NEXT: i64.and $push284=, $pop283, $pop157
-; CHECK-NEXT: i64.const $push285=, 4294967295
-; CHECK-NEXT: i64.eq $push286=, $pop284, $pop285
-; CHECK-NEXT: return $pop286
+; CHECK-NEXT: i32.and $push50=, $pop49, $pop290
+; CHECK-NEXT: i32.const $push289=, 26
+; CHECK-NEXT: i32.shl $push52=, $pop50, $pop289
+; CHECK-NEXT: i8x16.extract_lane_u $push44=, $1, 9
+; CHECK-NEXT: i32.const $push288=, 1
+; CHECK-NEXT: i32.and $push45=, $pop44, $pop288
+; CHECK-NEXT: i32.const $push287=, 25
+; CHECK-NEXT: i32.shl $push47=, $pop45, $pop287
+; CHECK-NEXT: i32.or $push48=, $pop47, $4
+; CHECK-NEXT: i32.or $push53=, $pop52, $pop48
+; CHECK-NEXT: i32.or $push58=, $pop57, $pop53
+; CHECK-NEXT: i32.or $push63=, $pop62, $pop58
+; CHECK-NEXT: i32.or $push68=, $pop67, $pop63
+; CHECK-NEXT: i32.or $push73=, $pop72, $pop68
+; CHECK-NEXT: i32.or $push77=, $pop76, $pop73
+; CHECK-NEXT: i32.or $push155=, $pop154, $pop77
+; CHECK-NEXT: i64.extend_i32_u $push156=, $pop155
+; CHECK-NEXT: i64.and $push282=, $pop281, $pop156
+; CHECK-NEXT: i64.const $push283=, 4294967295
+; CHECK-NEXT: i64.eq $push284=, $pop282, $pop283
+; CHECK-NEXT: return $pop284
%bits = trunc <64 x i8> %x to <64 x i1>
%ret = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %bits)
ret i1 %ret
From 36918c1f2d3203af1ad16cbc7eb5b897f09970a3 Mon Sep 17 00:00:00 2001
From: Yolanda Chen <yolanda.chen at intel.com>
Date: Mon, 1 Jul 2024 19:06:30 +0800
Subject: [PATCH 2/2] minor fix
---
.../Target/WebAssembly/WebAssemblyRegStackify.cpp | 14 +++++---------
llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll | 2 +-
2 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 4eea484c782e9..bc6541680e8ce 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -714,12 +714,12 @@ class TreeWalkerState {
RangeTy &Range = Worklist.back();
MachineOperand &Op = *Range.begin();
Range = drop_begin(Range);
- if (Op.isReg()) {
+ if (Op.isReg())
cur_stack_depth--;
- }
- if (Range.empty()) {
+
+ if (Range.empty())
Worklist.pop_back();
- }
+
assert(cur_stack_depth >= 0);
assert((Worklist.empty() || !Worklist.back().empty()) &&
"Empty ranges shouldn't remain in the worklist");
@@ -729,9 +729,8 @@ class TreeWalkerState {
template <typename T> int getNumRegs(const T &Range) {
int num = 0;
for (auto it = Range.begin(); it != Range.end(); it++) {
- if (it->isReg()) {
+ if (it->isReg())
num++;
- }
}
return num;
}
@@ -825,7 +824,6 @@ class CommutingState {
Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
// Tentatively commute the operands and try again.
- LLVM_DEBUG(dbgs() << "Commute insert\n");
TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
TreeWalker.resetTopOperands(Insert);
TentativelyCommuting = true;
@@ -877,7 +875,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// operands off the stack in LIFO order.
CommutingState Commuting;
TreeWalkerState TreeWalker(Insert);
- LLVM_DEBUG(dbgs() << "Walk instruction"; Insert->dump());
while (!TreeWalker.done()) {
MachineOperand &Use = TreeWalker.pop();
@@ -885,7 +882,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (!Use.isReg())
continue;
- // here only pop reg in stack.
Register Reg = Use.getReg();
assert(Use.isUse() && "explicit_uses() should only iterate over uses");
assert(!Use.isImplicit() &&
diff --git a/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
index 481a14fde03c9..04995b90a4470 100644
--- a/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/reg-stackify-simd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mattr=+simd128,+relaxed-simd -target-abi=experimental-mv | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128,+relaxed-simd | FileCheck %s
target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
target triple = "wasm32-unknown-wasi"
More information about the llvm-commits mailing list