[llvm] [TTI][WebAssembly] Pairwise reduction expansion (PR #93948)

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 12 00:47:15 PDT 2024


https://github.com/sparker-arm updated https://github.com/llvm/llvm-project/pull/93948

>From ddef2e51ed10ea7bfb2a682649d14a0c008463a3 Mon Sep 17 00:00:00 2001
From: Samuel Parker <sam.parker at arm.com>
Date: Fri, 31 May 2024 10:53:26 +0100
Subject: [PATCH] [TTI][WebAssembly] Pairwise reduction expansion

WebAssembly doesn't support horizontal operations nor does it have a
way of expressing fast-math or reassoc flags, so runtimes are
currently unable to use pairwise operations when generating code from
the existing shuffle patterns.

This patch allows the backend to select which arbitrary shuffle
pattern is used per reduction intrinsic. The default behaviour is
the same as the existing one, which splits the vector into a top
and a bottom half. The other pattern introduced is a pairwise
shuffle.

WebAssembly enables pairwise reductions for fp add.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   14 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |    5 +
 .../include/llvm/Transforms/Utils/LoopUtils.h |    2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |    6 +
 llvm/lib/CodeGen/ExpandReductions.cpp         |   10 +-
 .../WebAssemblyTargetTransformInfo.cpp        |   12 +
 .../WebAssemblyTargetTransformInfo.h          |    2 +
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   42 +-
 .../test/CodeGen/WebAssembly/vector-reduce.ll | 1074 +++++++++++++++++
 9 files changed, 1151 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/WebAssembly/vector-reduce.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index dcdd9f82cde8e..bda9d4e624505 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1705,6 +1705,13 @@ class TargetTransformInfo {
   /// into a shuffle sequence.
   bool shouldExpandReduction(const IntrinsicInst *II) const;
 
+  enum struct ReductionShuffle { SplitHalf, Pairwise };
+
+  /// \returns The shuffle sequence pattern used to expand the given reduction
+  /// intrinsic.
+  ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;
+
   /// \returns the size cost of rematerializing a GlobalValue address relative
   /// to a stack reload.
   unsigned getGISelRematGlobalCost() const;
@@ -2156,6 +2163,8 @@ class TargetTransformInfo::Concept {
   virtual bool preferEpilogueVectorization() const = 0;
 
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
+  virtual ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const = 0;
   virtual unsigned getGISelRematGlobalCost() const = 0;
   virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
   virtual bool enableScalableVectorization() const = 0;
@@ -2898,6 +2907,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.shouldExpandReduction(II);
   }
 
+  ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override {
+    return Impl.getPreferredExpandedReductionShuffle(II);
+  }
+
   unsigned getGISelRematGlobalCost() const override {
     return Impl.getGISelRematGlobalCost();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 01624de190d51..c1eb6151440be 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -936,6 +936,11 @@ class TargetTransformInfoImplBase {
 
   bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
 
+  TTI::ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const {
+    return TTI::ReductionShuffle::SplitHalf;
+  }
+
   unsigned getGISelRematGlobalCost() const { return 1; }
 
   unsigned getMinTripCountTailFoldingThreshold() const { return 0; }
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 345e09dce0b2b..9b26ad0b2fc8c 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -15,6 +15,7 @@
 
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 namespace llvm {
@@ -384,6 +385,7 @@ Value *getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
 /// Generates a vector reduction using shufflevectors to reduce the value.
 /// Fast-math-flags are propagated using the IRBuilder's setting.
 Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
+                           TargetTransformInfo::ReductionShuffle RS,
                            RecurKind MinMaxKind = RecurKind::None);
 
 /// Create a target reduction of the given vector. The reduction operation
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c175d1737e54b..be4069bb3eabf 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1317,6 +1317,12 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
 
+TargetTransformInfo::ReductionShuffle
+TargetTransformInfo::getPreferredExpandedReductionShuffle(
+    const IntrinsicInst *II) const {
+  return TTIImpl->getPreferredExpandedReductionShuffle(II);
+}
+
 unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
   return TTIImpl->getGISelRematGlobalCost();
 }
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index 0b1504e51b1bb..d6778ec666cbe 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -59,6 +59,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
     Intrinsic::ID ID = II->getIntrinsicID();
     RecurKind RK = getMinMaxReductionRecurKind(ID);
+    TargetTransformInfo::ReductionShuffle RS =
+        TTI->getPreferredExpandedReductionShuffle(II);
 
     Value *Rdx = nullptr;
     IRBuilder<> Builder(II);
@@ -79,7 +81,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         if (!isPowerOf2_32(
                 cast<FixedVectorType>(Vec->getType())->getNumElements()))
           continue;
-        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
         Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
                                   "bin.rdx");
       }
@@ -112,7 +114,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         break;
       }
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     case Intrinsic::vector_reduce_add:
@@ -127,7 +129,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
               cast<FixedVectorType>(Vec->getType())->getNumElements()))
         continue;
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     case Intrinsic::vector_reduce_fmax:
@@ -140,7 +142,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
           !FMF.noNaNs())
         continue;
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 9a434d9b1db54..b109594811d97 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -94,6 +94,18 @@ WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return Cost;
 }
 
+TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
+    const IntrinsicInst *II) const {
+
+  switch (II->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::vector_reduce_fadd:
+    return TTI::ReductionShuffle::Pairwise;
+  }
+  return TTI::ReductionShuffle::SplitHalf;
+}
+
 bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller,
                                              const Function *Callee) const {
   // Allow inlining only when the Callee has a subset of the Caller's
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index e10f0928ed531..269922cc3ea84 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -70,6 +70,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 
+  TTI::ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;
   /// @}
 
   bool areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index a127a3265758d..0f9fea8668118 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1077,7 +1077,9 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
 
 // Helper to generate a log2 shuffle reduction.
 Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
-                                 unsigned Op, RecurKind RdxKind) {
+                                 unsigned Op,
+                                 TargetTransformInfo::ReductionShuffle RS,
+                                 RecurKind RdxKind) {
   unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
@@ -1091,18 +1093,10 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
   // will never be relevant here.  Note that it would be generally unsound to
   // propagate these from an intrinsic call to the expansion anyways as we/
   // change the order of operations.
-  Value *TmpVec = Src;
-  SmallVector<int, 32> ShuffleMask(VF);
-  for (unsigned i = VF; i != 1; i >>= 1) {
-    // Move the upper half of the vector to the lower half.
-    for (unsigned j = 0; j != i / 2; ++j)
-      ShuffleMask[j] = i / 2 + j;
-
-    // Fill the rest of the mask with undef.
-    std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
-
+  auto BuildShuffledOp = [&Builder, &Op,
+                          &RdxKind](SmallVectorImpl<int> &ShuffleMask,
+                                    Value *&TmpVec) -> void {
     Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
-
     if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
       TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                    "bin.rdx");
@@ -1111,6 +1105,30 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
              "Invalid min/max");
       TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf);
     }
+  };
+
+  Value *TmpVec = Src;
+  if (TargetTransformInfo::ReductionShuffle::Pairwise == RS) {
+    SmallVector<int, 32> ShuffleMask(VF);
+    for (unsigned stride = 1; stride < VF; stride <<= 1) {
+      // Initialise the mask with undef.
+      std::fill(ShuffleMask.begin(), ShuffleMask.end(), -1);
+      for (unsigned j = 0; j < VF; j += stride << 1) {
+        ShuffleMask[j] = j + stride;
+      }
+      BuildShuffledOp(ShuffleMask, TmpVec);
+    }
+  } else {
+    SmallVector<int, 32> ShuffleMask(VF);
+    for (unsigned i = VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = i / 2 + j;
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
+      BuildShuffledOp(ShuffleMask, TmpVec);
+    }
   }
   // The result is in the first element of the vector.
   return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
new file mode 100644
index 0000000000000..4b1a1a8a0c5b6
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -0,0 +1,1074 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128
+
+define i64 @pairwise_add_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_add_v2i64:
+; SIMD128:         .functype pairwise_add_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    i64x2.add $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.add.i64.v4i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_add_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_add_v4i32:
+; SIMD128:         .functype pairwise_add_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_add_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_add_v8i16:
+; SIMD128:         .functype pairwise_add_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_add_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_add_v16i8:
+; SIMD128:         .functype pairwise_add_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_mul_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2i64:
+; SIMD128:         .functype pairwise_mul_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    i64x2.mul $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_mul_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4i32:
+; SIMD128:         .functype pairwise_mul_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.mul $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.mul $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_mul_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_mul_v8i16:
+; SIMD128:         .functype pairwise_mul_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_mul_v16i8:
+; SIMD128:         .functype pairwise_mul_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $0, 0
+; SIMD128-NEXT:    i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    local.tee $push31=, $1=, $pop32
+; SIMD128-NEXT:    i8x16.extract_lane_u $push25=, $pop31, 0
+; SIMD128-NEXT:    i32.mul $push27=, $pop26, $pop25
+; SIMD128-NEXT:    i8x16.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
+; SIMD128-NEXT:    i32.mul $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i32.mul $push28=, $pop27, $pop24
+; SIMD128-NEXT:    i8x16.extract_lane_u $push19=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push18=, $1, 2
+; SIMD128-NEXT:    i32.mul $push20=, $pop19, $pop18
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push15=, $1, 6
+; SIMD128-NEXT:    i32.mul $push17=, $pop16, $pop15
+; SIMD128-NEXT:    i32.mul $push21=, $pop20, $pop17
+; SIMD128-NEXT:    i32.mul $push29=, $pop28, $pop21
+; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push10=, $1, 1
+; SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push7=, $1, 5
+; SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; SIMD128-NEXT:    i32.mul $push13=, $pop12, $pop9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push4=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $1, 3
+; SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push1=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 7
+; SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop2
+; SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop6
+; SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop14
+; SIMD128-NEXT:    return $pop30
+  %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_and_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_and_v2i64:
+; SIMD128:         .functype pairwise_and_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_and_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_and_v4i32:
+; SIMD128:         .functype pairwise_and_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.and $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.and $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_and_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_and_v8i16:
+; SIMD128:         .functype pairwise_and_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_and_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_and_v16i8:
+; SIMD128:         .functype pairwise_and_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_or_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_or_v2i64:
+; SIMD128:         .functype pairwise_or_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.or $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_or_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_or_v4i32:
+; SIMD128:         .functype pairwise_or_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.or $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.or $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_or_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_or_v8i16:
+; SIMD128:         .functype pairwise_or_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_or_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_or_v16i8:
+; SIMD128:         .functype pairwise_or_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_xor_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_xor_v2i64:
+; SIMD128:         .functype pairwise_xor_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.xor $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_xor_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_xor_v4i32:
+; SIMD128:         .functype pairwise_xor_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.xor $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.xor $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_xor_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_xor_v8i16:
+; SIMD128:         .functype pairwise_xor_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_xor_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_xor_v16i8:
+; SIMD128:         .functype pairwise_xor_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_smax_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_smax_v2i64:
+; SIMD128:         .functype pairwise_smax_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    i64x2.gt_s $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_smax_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_smax_v4i32:
+; SIMD128:         .functype pairwise_smax_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_s $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_s $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_smax_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_smax_v8i16:
+; SIMD128:         .functype pairwise_smax_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_smax_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_smax_v16i8:
+; SIMD128:         .functype pairwise_smax_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_smin_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_smin_v2i64:
+; SIMD128:         .functype pairwise_smin_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    i64x2.lt_s $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_smin_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_smin_v4i32:
+; SIMD128:         .functype pairwise_smin_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_s $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_s $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_smin_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_smin_v8i16:
+; SIMD128:         .functype pairwise_smin_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_smin_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_smin_v16i8:
+; SIMD128:         .functype pairwise_smin_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_umax_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_umax_v2i64:
+; SIMD128:         .functype pairwise_umax_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT:    i64.const $push4=, -1
+; SIMD128-NEXT:    i64.const $push3=, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 0
+; SIMD128-NEXT:    i64.gt_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64.select $push5=, $pop4, $pop3, $pop2
+; SIMD128-NEXT:    i64x2.replace_lane $push6=, $0, 0, $pop5
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop9, $pop6
+; SIMD128-NEXT:    i64x2.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT:    return $pop8
+  %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_umax_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_umax_v4i32:
+; SIMD128:         .functype pairwise_umax_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_u $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_u $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_umax_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_umax_v8i16:
+; SIMD128:         .functype pairwise_umax_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_umax_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_umax_v16i8:
+; SIMD128:         .functype pairwise_umax_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_umin_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_umin_v2i64:
+; SIMD128:         .functype pairwise_umin_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT:    i64.const $push4=, -1
+; SIMD128-NEXT:    i64.const $push3=, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 0
+; SIMD128-NEXT:    i64.lt_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64.select $push5=, $pop4, $pop3, $pop2
+; SIMD128-NEXT:    i64x2.replace_lane $push6=, $0, 0, $pop5
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop9, $pop6
+; SIMD128-NEXT:    i64x2.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT:    return $pop8
+  %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_umin_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_umin_v4i32:
+; SIMD128:         .functype pairwise_umin_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_u $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_u $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_umin_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_umin_v8i16:
+; SIMD128:         .functype pairwise_umin_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_umin_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_umin_v16i8:
+; SIMD128:         .functype pairwise_umin_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define double @pairwise_add_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_add_v2f64:
+; SIMD128:         .functype pairwise_add_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.add $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_add_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_add_v2f64_fast:
+; SIMD128:         .functype pairwise_add_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    f64x2.add $push1=, $0, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_add_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32:
+; SIMD128:         .functype pairwise_add_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.add $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.add $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.add $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_add_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32_fast:
+; SIMD128:         .functype pairwise_add_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_add_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32_reassoc:
+; SIMD128:         .functype pairwise_add_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_mul_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2f64:
+; SIMD128:         .functype pairwise_mul_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    f64.const $push1=, -0x0p0
+; SIMD128-NEXT:    f64.mul $push2=, $pop0, $pop1
+; SIMD128-NEXT:    f64x2.extract_lane $push3=, $0, 1
+; SIMD128-NEXT:    f64.mul $push4=, $pop2, $pop3
+; SIMD128-NEXT:    return $pop4
+  %res = tail call double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_mul_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2f64_fast:
+; SIMD128:         .functype pairwise_mul_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64.const $push0=, 0x0p0
+; SIMD128-NEXT:    return $pop0
+  %res = tail call fast double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_mul_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32:
+; SIMD128:         .functype pairwise_mul_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    f32.const $push1=, -0x0p0
+; SIMD128-NEXT:    f32.mul $push2=, $pop0, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 1
+; SIMD128-NEXT:    f32.mul $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 2
+; SIMD128-NEXT:    f32.mul $push6=, $pop4, $pop5
+; SIMD128-NEXT:    f32x4.extract_lane $push7=, $0, 3
+; SIMD128-NEXT:    f32.mul $push8=, $pop6, $pop7
+; SIMD128-NEXT:    return $pop8
+  %res = tail call float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_mul_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32_fast:
+; SIMD128:         .functype pairwise_mul_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32.const $push0=, 0x0p0
+; SIMD128-NEXT:    return $pop0
+  %res = tail call fast float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_mul_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32_reassoc:
+; SIMD128:         .functype pairwise_mul_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.mul $push7=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.mul $push2=, $pop6, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    f32.const $push4=, -0x0p0
+; SIMD128-NEXT:    f32.mul $push5=, $pop3, $pop4
+; SIMD128-NEXT:    return $pop5
+  %res = tail call reassoc float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_max_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_max_v2f64:
+; SIMD128:         .functype pairwise_max_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    call $push2=, fmax, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_max_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_max_v2f64_fast:
+; SIMD128:         .functype pairwise_max_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    f64x2.gt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_max_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32:
+; SIMD128:         .functype pairwise_max_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fmaxf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fmaxf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fmaxf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_max_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32_fast:
+; SIMD128:         .functype pairwise_max_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
+; SIMD128-NEXT:    f32x4.gt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
+; SIMD128-NEXT:    f32x4.gt $push1=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_max_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32_reassoc:
+; SIMD128:         .functype pairwise_max_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fmaxf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fmaxf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fmaxf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_min_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_min_v2f64:
+; SIMD128:         .functype pairwise_min_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    call $push2=, fmin, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_min_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_min_v2f64_fast:
+; SIMD128:         .functype pairwise_min_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    f64x2.lt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_min_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32:
+; SIMD128:         .functype pairwise_min_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fminf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fminf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fminf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_min_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32_fast:
+; SIMD128:         .functype pairwise_min_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
+; SIMD128-NEXT:    f32x4.lt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
+; SIMD128-NEXT:    f32x4.lt $push1=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_min_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32_reassoc:
+; SIMD128:         .functype pairwise_min_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fminf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fminf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fminf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_maximum_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v2f64:
+; SIMD128:         .functype pairwise_maximum_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_maximum_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v2f64_fast:
+; SIMD128:         .functype pairwise_maximum_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_maximum_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32:
+; SIMD128:         .functype pairwise_maximum_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_maximum_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32_fast:
+; SIMD128:         .functype pairwise_maximum_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_maximum_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32_reassoc:
+; SIMD128:         .functype pairwise_maximum_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_minimum_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v2f64:
+; SIMD128:         .functype pairwise_minimum_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_minimum_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v2f64_fast:
+; SIMD128:         .functype pairwise_minimum_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_minimum_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32:
+; SIMD128:         .functype pairwise_minimum_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_minimum_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32_fast:
+; SIMD128:         .functype pairwise_minimum_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call fast float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_minimum_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32_reassoc:
+; SIMD128:         .functype pairwise_minimum_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}



More information about the llvm-commits mailing list