[llvm] Handle VECREDUCE intrinsics in NVPTX backend (PR #136253)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 21:42:46 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/136253
>From 1c98e5d2b750480d18831c7eb3341c46466d1631 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 16 Apr 2025 17:17:47 -0700
Subject: [PATCH 1/3] [NVPTX] lower VECREDUCE intrinsics to tree reduction
Also adds support for sm_100+ fmax3/fmin3 instructions, introduced in
PTX 8.8.
This method of tree reduction has a few benefits over the default in
DAGTypeLegalizer:
- Produces the optimal number of operations supported by the target. Instead
of progressively splitting the vector operand top-down, first
scalarize it and then build the tree bottom-up. This uses larger
operations when available and leaves smaller ones for the remaining
elements.
- Faster compile time. Happens in one pass over the intrinsic, rather
than O(N) passes if iteratively splitting the vector operands.
---
llvm/lib/Target/NVPTX/NVPTX.td | 10 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 128 ++++
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 54 ++
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 +
.../CodeGen/NVPTX/reduction-intrinsics.ll | 694 ++++++++++++++++++
6 files changed, 892 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 5467ae011a208..d4dc278cfa648 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -36,17 +36,19 @@ class FeaturePTX<int version>:
foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
60, 61, 62, 70, 72, 75, 80, 86, 87,
- 89, 90, 100, 101, 120] in
+ 89, 90, 100, 101, 103, 120, 121] in
def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
def SM90a: FeatureSM<"90a", 901>;
def SM100a: FeatureSM<"100a", 1001>;
def SM101a: FeatureSM<"101a", 1011>;
+def SM103a: FeatureSM<"103a", 1031>;
def SM120a: FeatureSM<"120a", 1201>;
+def SM121a: FeatureSM<"121a", 1211>;
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
70, 71, 72, 73, 74, 75, 76, 77, 78,
- 80, 81, 82, 83, 84, 85, 86, 87] in
+ 80, 81, 82, 83, 84, 85, 86, 87, 88] in
def PTX#version: FeaturePTX<version>;
//===----------------------------------------------------------------------===//
@@ -81,8 +83,12 @@ def : Proc<"sm_100", [SM100, PTX86]>;
def : Proc<"sm_100a", [SM100a, PTX86]>;
def : Proc<"sm_101", [SM101, PTX86]>;
def : Proc<"sm_101a", [SM101a, PTX86]>;
+def : Proc<"sm_103", [SM103, PTX88]>;
+def : Proc<"sm_103a", [SM103a, PTX88]>;
def : Proc<"sm_120", [SM120, PTX87]>;
def : Proc<"sm_120a", [SM120a, PTX87]>;
+def : Proc<"sm_121", [SM121, PTX88]>;
+def : Proc<"sm_121a", [SM121a, PTX88]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 277a34173e7b8..4d8dd6603f515 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -85,6 +85,12 @@ static cl::opt<unsigned> FMAContractLevelOpt(
" 1: do it 2: do it aggressively"),
cl::init(2));
+static cl::opt<bool> DisableFOpTreeReduce(
+ "nvptx-disable-fop-tree-reduce", cl::Hidden,
+ cl::desc("NVPTX Specific: don't emit tree reduction for floating-point "
+ "reduction operations"),
+ cl::init(false));
+
static cl::opt<int> UsePrecDivF32(
"nvptx-prec-divf32", cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
@@ -828,6 +834,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (STI.allowFP16Math() || STI.hasBF16Math())
setTargetDAGCombine(ISD::SETCC);
+ // Vector reduction operations. These are transformed into a tree evaluation
+ // of nodes which may or may not be legal.
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ }
+
// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in
@@ -1079,6 +1094,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
+ MAKE_CASE(NVPTXISD::FMAXNUM3)
+ MAKE_CASE(NVPTXISD::FMINNUM3)
+ MAKE_CASE(NVPTXISD::FMAXIMUM3)
+ MAKE_CASE(NVPTXISD::FMINIMUM3)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::STACKRESTORE)
MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -2128,6 +2147,108 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+/// A generic routine for constructing a tree reduction for a vector operand.
+/// This method differs from iterative splitting in DAGTypeLegalizer by
+/// first scalarizing the vector and then progressively grouping elements
+/// bottom-up. This allows easily building the optimal (minimum) number of nodes
+/// with different numbers of operands (eg. max3 vs max2).
+static SDValue BuildTreeReduction(
+ const SDValue &VectorOp,
+ ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+ const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+ EVT VectorTy = VectorOp.getValueType();
+ EVT EltTy = VectorTy.getVectorElementType();
+ const unsigned NumElts = VectorTy.getVectorNumElements();
+
+ // scalarize vector
+ SmallVector<SDValue> Elements(NumElts);
+ for (unsigned I = 0, E = NumElts; I != E; ++I) {
+ Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorOp,
+ DAG.getConstant(I, DL, MVT::i64));
+ }
+
+ // now build the computation graph in place at each level
+ SmallVector<SDValue> Level = Elements;
+ for (unsigned OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
+ const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
+
+ // partially reduce all elements in level
+ SmallVector<SDValue> ReducedLevel;
+ unsigned I = 0, E = Level.size();
+ for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
+ // Reduce elements in groups of [DefaultGroupSize], as much as possible.
+ ReducedLevel.push_back(DAG.getNode(
+ DefaultScalarOp, DL, EltTy,
+ ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
+ }
+
+ if (I < E) {
+ if (ReducedLevel.empty()) {
+ // The current operator requires more inputs than there are operands at
+ // this level. Pick a smaller operator and retry.
+ ++OpIdx;
+ assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+ continue;
+ }
+
+ // Otherwise, we just have a remainder, which we push to the next level.
+ for (; I < E; ++I)
+ ReducedLevel.push_back(Level[I]);
+ }
+ Level = ReducedLevel;
+ }
+
+ return *Level.begin();
+}
+
+/// Lower fadd/fmul vector reductions. Builds a computation graph (tree) and
+/// serializes it.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ // If we can't reorder sub-operations, let DAGTypeLegalizer lower this op.
+ if (DisableFOpTreeReduce || !Op->getFlags().hasAllowReassociation())
+ return SDValue();
+
+ EVT EltTy = Op.getOperand(0).getValueType().getVectorElementType();
+ const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+ STI.getPTXVersion() >= 88;
+ SDLoc DL(Op);
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Operators;
+ switch (Op->getOpcode()) {
+ case ISD::VECREDUCE_FADD:
+ Operators = {{ISD::FADD, 2}};
+ break;
+ case ISD::VECREDUCE_FMUL:
+ Operators = {{ISD::FMUL, 2}};
+ break;
+ case ISD::VECREDUCE_FMAX:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMAXNUM3, 3});
+ Operators.push_back({ISD::FMAXNUM, 2});
+ break;
+ case ISD::VECREDUCE_FMIN:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMINNUM3, 3});
+ Operators.push_back({ISD::FMINNUM, 2});
+ break;
+ case ISD::VECREDUCE_FMAXIMUM:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMAXIMUM3, 3});
+ Operators.push_back({ISD::FMAXIMUM, 2});
+ break;
+ case ISD::VECREDUCE_FMINIMUM:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMINIMUM3, 3});
+ Operators.push_back({ISD::FMINIMUM, 2});
+ break;
+ default:
+ llvm_unreachable("unhandled vecreduce operation");
+ }
+
+ return BuildTreeReduction(Op.getOperand(0), Operators, DL, Op->getFlags(),
+ DAG);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -2905,6 +3026,13 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::LOAD:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 7a8bf3bf33a94..84f0bb1500f80 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -73,6 +73,11 @@ enum NodeType : unsigned {
UNPACK_VECTOR,
FCOPYSIGN,
+ FMAXNUM3,
+ FMINNUM3,
+ FMAXIMUM3,
+ FMINIMUM3,
+
DYNAMIC_STACKALLOC,
STACKRESTORE,
STACKSAVE,
@@ -296,6 +301,7 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ee6380a8a89c4..d9bf0613b0eaf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -371,6 +371,46 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
+// 3-input min/max (sm_100+) for f32 only
+multiclass FMINIMUMMAXIMUM3<string OpcStr, SDNode OpNode> {
+ def f32rrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rii_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, Float32Regs:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rii :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
@@ -1139,6 +1179,20 @@ defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min.NaN", nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max.NaN", nvptx_fmaximum3>;
+
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 9e77f628da7a7..2cc81a064152f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -83,6 +83,8 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
}
unsigned getMinVectorRegisterBitWidth() const { return 32; }
+ bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
+
// We don't want to prevent inlining because of target-cpu and -features
// attributes that were added to newer versions of LLVM/Clang: There are
// no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
new file mode 100644
index 0000000000000..b49ecbed18c12
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -0,0 +1,694 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
+; RUN: -disable-post-ra -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK-SM80 %s
+; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
+; RUN: -disable-post-ra -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
+; RUN: -disable-post-ra -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK-SM100 %s
+; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
+; RUN: -disable-post-ra -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 %}
+target triple = "nvptx64-nvidia-cuda"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check straight-line reduction.
+define float @reduce_fadd(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fadd(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<17>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_param_0];
+; CHECK-SM80-NEXT: add.rn.f32 %f9, %f1, 0f00000000;
+; CHECK-SM80-NEXT: add.rn.f32 %f10, %f9, %f2;
+; CHECK-SM80-NEXT: add.rn.f32 %f11, %f10, %f3;
+; CHECK-SM80-NEXT: add.rn.f32 %f12, %f11, %f4;
+; CHECK-SM80-NEXT: add.rn.f32 %f13, %f12, %f5;
+; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, %f6;
+; CHECK-SM80-NEXT: add.rn.f32 %f15, %f14, %f7;
+; CHECK-SM80-NEXT: add.rn.f32 %f16, %f15, %f8;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fadd(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<17>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_param_0];
+; CHECK-SM100-NEXT: add.rn.f32 %f9, %f1, 0f00000000;
+; CHECK-SM100-NEXT: add.rn.f32 %f10, %f9, %f2;
+; CHECK-SM100-NEXT: add.rn.f32 %f11, %f10, %f3;
+; CHECK-SM100-NEXT: add.rn.f32 %f12, %f11, %f4;
+; CHECK-SM100-NEXT: add.rn.f32 %f13, %f12, %f5;
+; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, %f6;
+; CHECK-SM100-NEXT: add.rn.f32 %f15, %f14, %f7;
+; CHECK-SM100-NEXT: add.rn.f32 %f16, %f15, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fadd_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fadd_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<17>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_param_0];
+; CHECK-SM80-NEXT: add.rn.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: add.rn.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: add.rn.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: add.rn.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: add.rn.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: add.rn.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: add.rn.f32 %f16, %f15, 0f00000000;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fadd_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<17>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_param_0];
+; CHECK-SM100-NEXT: add.rn.f32 %f9, %f7, %f8;
+; CHECK-SM100-NEXT: add.rn.f32 %f10, %f5, %f6;
+; CHECK-SM100-NEXT: add.rn.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: add.rn.f32 %f12, %f3, %f4;
+; CHECK-SM100-NEXT: add.rn.f32 %f13, %f1, %f2;
+; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: add.rn.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: add.rn.f32 %f16, %f15, 0f00000000;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fadd_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fadd_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<15>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fadd_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: add.rn.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: add.rn.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: add.rn.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: add.rn.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: add.rn.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: add.rn.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, 0f00000000;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f14;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fadd_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<15>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fadd_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: add.rn.f32 %f8, %f5, %f6;
+; CHECK-SM100-NEXT: add.rn.f32 %f9, %f8, %f7;
+; CHECK-SM100-NEXT: add.rn.f32 %f10, %f3, %f4;
+; CHECK-SM100-NEXT: add.rn.f32 %f11, %f1, %f2;
+; CHECK-SM100-NEXT: add.rn.f32 %f12, %f11, %f10;
+; CHECK-SM100-NEXT: add.rn.f32 %f13, %f12, %f9;
+; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, 0f00000000;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f14;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in)
+ ret float %res
+}
+
+define float @reduce_fmul(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmul(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_param_0];
+; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f1, %f2;
+; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f9, %f3;
+; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f10, %f4;
+; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f11, %f5;
+; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f12, %f6;
+; CHECK-SM80-NEXT: mul.rn.f32 %f14, %f13, %f7;
+; CHECK-SM80-NEXT: mul.rn.f32 %f15, %f14, %f8;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmul(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_param_0];
+; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f1, %f2;
+; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f9, %f3;
+; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f10, %f4;
+; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f11, %f5;
+; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f12, %f6;
+; CHECK-SM100-NEXT: mul.rn.f32 %f14, %f13, %f7;
+; CHECK-SM100-NEXT: mul.rn.f32 %f15, %f14, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fmul_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmul_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_param_0];
+; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: mul.rn.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: mul.rn.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmul_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_param_0];
+; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f7, %f8;
+; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f5, %f6;
+; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f3, %f4;
+; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f1, %f2;
+; CHECK-SM100-NEXT: mul.rn.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: mul.rn.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fmul_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmul_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmul_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: mul.rn.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmul_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<14>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmul_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: mul.rn.f32 %f8, %f5, %f6;
+; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f8, %f7;
+; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f3, %f4;
+; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f1, %f2;
+; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f11, %f10;
+; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f12, %f9;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
+ ret float %res
+}
+
+define float @reduce_fmax(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_param_0];
+; CHECK-SM80-NEXT: max.f32 %f9, %f4, %f8;
+; CHECK-SM80-NEXT: max.f32 %f10, %f2, %f6;
+; CHECK-SM80-NEXT: max.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: max.f32 %f12, %f3, %f7;
+; CHECK-SM80-NEXT: max.f32 %f13, %f1, %f5;
+; CHECK-SM80-NEXT: max.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: max.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmax(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_param_0];
+; CHECK-SM100-NEXT: max.f32 %f9, %f4, %f8;
+; CHECK-SM100-NEXT: max.f32 %f10, %f2, %f6;
+; CHECK-SM100-NEXT: max.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: max.f32 %f12, %f3, %f7;
+; CHECK-SM100-NEXT: max.f32 %f13, %f1, %f5;
+; CHECK-SM100-NEXT: max.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: max.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fmax_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_param_0];
+; CHECK-SM80-NEXT: max.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: max.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: max.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: max.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: max.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: max.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: max.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmax_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_param_0];
+; CHECK-SM100-NEXT: max.f32 %f9, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: max.f32 %f10, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.f32 %f11, %f10, %f9, %f7;
+; CHECK-SM100-NEXT: max.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fmax_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmax_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: max.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: max.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: max.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: max.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: max.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: max.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmax_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmax_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: max.f32 %f8, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: max.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.f32 %f10, %f9, %f8, %f7;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f10;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
+ ret float %res
+}
+
+define float @reduce_fmin(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_param_0];
+; CHECK-SM80-NEXT: min.f32 %f9, %f4, %f8;
+; CHECK-SM80-NEXT: min.f32 %f10, %f2, %f6;
+; CHECK-SM80-NEXT: min.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: min.f32 %f12, %f3, %f7;
+; CHECK-SM80-NEXT: min.f32 %f13, %f1, %f5;
+; CHECK-SM80-NEXT: min.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: min.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmin(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_param_0];
+; CHECK-SM100-NEXT: min.f32 %f9, %f4, %f8;
+; CHECK-SM100-NEXT: min.f32 %f10, %f2, %f6;
+; CHECK-SM100-NEXT: min.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: min.f32 %f12, %f3, %f7;
+; CHECK-SM100-NEXT: min.f32 %f13, %f1, %f5;
+; CHECK-SM100-NEXT: min.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: min.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fmin(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fmin_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_param_0];
+; CHECK-SM80-NEXT: min.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: min.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: min.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: min.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: min.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: min.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: min.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmin_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_param_0];
+; CHECK-SM100-NEXT: min.f32 %f9, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: min.f32 %f10, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.f32 %f11, %f10, %f9, %f7;
+; CHECK-SM100-NEXT: min.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fmin_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmin_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: min.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: min.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: min.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: min.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: min.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: min.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmin_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmin_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: min.f32 %f8, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: min.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.f32 %f10, %f9, %f8, %f7;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f10;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in)
+ ret float %res
+}
+
+define float @reduce_fmaximum(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f4, %f8;
+; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f2, %f6;
+; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: max.NaN.f32 %f12, %f3, %f7;
+; CHECK-SM80-NEXT: max.NaN.f32 %f13, %f1, %f5;
+; CHECK-SM80-NEXT: max.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: max.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmaximum(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f4, %f8;
+; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f2, %f6;
+; CHECK-SM100-NEXT: max.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: max.NaN.f32 %f12, %f3, %f7;
+; CHECK-SM100-NEXT: max.NaN.f32 %f13, %f1, %f5;
+; CHECK-SM100-NEXT: max.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: max.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fmaximum_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: max.NaN.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: max.NaN.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: max.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: max.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmaximum_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.NaN.f32 %f11, %f10, %f9, %f7;
+; CHECK-SM100-NEXT: max.NaN.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fmaximum_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmaximum_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: max.NaN.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: max.NaN.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fmaximum_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmaximum_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %f8, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f9, %f8, %f7;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f10;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in)
+ ret float %res
+}
+
+define float @reduce_fminimum(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f4, %f8;
+; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f2, %f6;
+; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: min.NaN.f32 %f12, %f3, %f7;
+; CHECK-SM80-NEXT: min.NaN.f32 %f13, %f1, %f5;
+; CHECK-SM80-NEXT: min.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: min.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fminimum(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f4, %f8;
+; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f2, %f6;
+; CHECK-SM100-NEXT: min.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM100-NEXT: min.NaN.f32 %f12, %f3, %f7;
+; CHECK-SM100-NEXT: min.NaN.f32 %f13, %f1, %f5;
+; CHECK-SM100-NEXT: min.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM100-NEXT: min.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ret;
+ %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define float @reduce_fminimum_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f7, %f8;
+; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f5, %f6;
+; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f10, %f9;
+; CHECK-SM80-NEXT: min.NaN.f32 %f12, %f3, %f4;
+; CHECK-SM80-NEXT: min.NaN.f32 %f13, %f1, %f2;
+; CHECK-SM80-NEXT: min.NaN.f32 %f14, %f13, %f12;
+; CHECK-SM80-NEXT: min.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fminimum_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.NaN.f32 %f11, %f10, %f9, %f7;
+; CHECK-SM100-NEXT: min.NaN.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fminimum_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .f32 %f<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fminimum_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %f8, %f5, %f6;
+; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f8, %f7;
+; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f3, %f4;
+; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f1, %f2;
+; CHECK-SM80-NEXT: min.NaN.f32 %f12, %f11, %f10;
+; CHECK-SM80-NEXT: min.NaN.f32 %f13, %f12, %f9;
+; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-SM80-NEXT: ret;
+;
+; CHECK-SM100-LABEL: reduce_fminimum_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .f32 %f<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fminimum_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %f8, %f4, %f5, %f6;
+; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f9, %f8, %f7;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f10;
+; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
+ ret float %res
+}
>From aed0f4ac5220c2227d3a6b04f151805888d9afd0 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 17 Apr 2025 20:57:23 -0700
Subject: [PATCH 2/3] [NVPTX] support rest of VECREDUCE intrinsics and other
improvements
- Support all VECREDUCE intrinsics
- Clean up FileCheck directives in lit test
- Also handle sequential lowering in NVPTX backend, where we can still
use larger operations.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 176 +-
.../CodeGen/NVPTX/reduction-intrinsics.ll | 1952 +++++++++++++----
2 files changed, 1718 insertions(+), 410 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4d8dd6603f515..8a112e77115a8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -835,12 +835,22 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine(ISD::SETCC);
// Vector reduction operations. These are transformed into a tree evaluation
- // of nodes which may or may not be legal.
+ // of nodes which may initially be illegal.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
- ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
- ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
- VT, Custom);
+ MVT EltVT = VT.getVectorElementType();
+ if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
+ EltVT == MVT::f64) {
+ setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ } else if (EltVT.isScalarInteger()) {
+ setOperationAction(
+ {ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND,
+ ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX,
+ ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
+ VT, Custom);
+ }
}
// Promote fp16 arithmetic if fp16 hardware isn't available or the
@@ -2147,29 +2157,17 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-/// A generic routine for constructing a tree reduction for a vector operand.
+/// A generic routine for constructing a tree reduction on a vector operand.
/// This method differs from iterative splitting in DAGTypeLegalizer by
-/// first scalarizing the vector and then progressively grouping elements
-/// bottom-up. This allows easily building the optimal (minimum) number of nodes
-/// with different numbers of operands (eg. max3 vs max2).
+/// progressively grouping elements bottom-up.
static SDValue BuildTreeReduction(
- const SDValue &VectorOp,
+ const SmallVector<SDValue> &Elements, EVT EltTy,
ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
- EVT VectorTy = VectorOp.getValueType();
- EVT EltTy = VectorTy.getVectorElementType();
- const unsigned NumElts = VectorTy.getVectorNumElements();
-
- // scalarize vector
- SmallVector<SDValue> Elements(NumElts);
- for (unsigned I = 0, E = NumElts; I != E; ++I) {
- Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorOp,
- DAG.getConstant(I, DL, MVT::i64));
- }
-
// now build the computation graph in place at each level
SmallVector<SDValue> Level = Elements;
- for (unsigned OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
+ unsigned OpIdx = 0;
+ while (Level.size() > 1) {
const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
// partially reduce all elements in level
@@ -2201,52 +2199,139 @@ static SDValue BuildTreeReduction(
return *Level.begin();
}
-/// Lower fadd/fmul vector reductions. Builds a computation graph (tree) and
-/// serializes it.
+/// Lower reductions to either a sequence of operations or a tree if
+/// reassociations are allowed. This method will use larger operations like
+/// max3/min3 when the target supports them.
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
- // If we can't reorder sub-operations, let DAGTypeLegalizer lower this op.
- if (DisableFOpTreeReduce || !Op->getFlags().hasAllowReassociation())
+ if (DisableFOpTreeReduce)
return SDValue();
- EVT EltTy = Op.getOperand(0).getValueType().getVectorElementType();
+ SDLoc DL(Op);
+ const SDNodeFlags Flags = Op->getFlags();
+ const SDValue &Vector = Op.getOperand(0);
+ EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;
- SDLoc DL(Op);
- SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Operators;
+
+ // A list of SDNode opcodes with equivalent semantics, sorted descending by
+ // number of inputs they take.
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+ bool IsReassociatable;
+
switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
- Operators = {{ISD::FADD, 2}};
+ ScalarOps = {{ISD::FADD, 2}};
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMUL:
- Operators = {{ISD::FMUL, 2}};
+ ScalarOps = {{ISD::FMUL, 2}};
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMAX:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMAXNUM3, 3});
- Operators.push_back({ISD::FMAXNUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
+ ScalarOps.push_back({ISD::FMAXNUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMIN:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMINNUM3, 3});
- Operators.push_back({ISD::FMINNUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
+ ScalarOps.push_back({ISD::FMINNUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMAXIMUM:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMAXIMUM3, 3});
- Operators.push_back({ISD::FMAXIMUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
+ ScalarOps.push_back({ISD::FMAXIMUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMINIMUM:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMINIMUM3, 3});
- Operators.push_back({ISD::FMINIMUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
+ ScalarOps.push_back({ISD::FMINIMUM, 2});
+ IsReassociatable = false;
+ break;
+ case ISD::VECREDUCE_ADD:
+ ScalarOps = {{ISD::ADD, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_MUL:
+ ScalarOps = {{ISD::MUL, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_UMAX:
+ ScalarOps = {{ISD::UMAX, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_UMIN:
+ ScalarOps = {{ISD::UMIN, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_SMAX:
+ ScalarOps = {{ISD::SMAX, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_SMIN:
+ ScalarOps = {{ISD::SMIN, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_AND:
+ ScalarOps = {{ISD::AND, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_OR:
+ ScalarOps = {{ISD::OR, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_XOR:
+ ScalarOps = {{ISD::XOR, 2}};
+ IsReassociatable = true;
break;
default:
llvm_unreachable("unhandled vecreduce operation");
}
- return BuildTreeReduction(Op.getOperand(0), Operators, DL, Op->getFlags(),
- DAG);
+ EVT VectorTy = Vector.getValueType();
+ const unsigned NumElts = VectorTy.getVectorNumElements();
+
+ // scalarize vector
+ SmallVector<SDValue> Elements(NumElts);
+ for (unsigned I = 0, E = NumElts; I != E; ++I) {
+ Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, Vector,
+ DAG.getConstant(I, DL, MVT::i64));
+ }
+
+ // Lower to tree reduction.
+ if (IsReassociatable || Flags.hasAllowReassociation())
+ return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+
+ // Lower to sequential reduction.
+ SDValue Accumulator;
+ for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
+ assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
+ const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
+
+ if (!Accumulator) {
+ if (I + DefaultGroupSize <= NumElts) {
+ Accumulator = DAG.getNode(
+ DefaultScalarOp, DL, EltTy,
+ ArrayRef(Elements).slice(I, I + DefaultGroupSize), Flags);
+ I += DefaultGroupSize;
+ }
+ }
+
+ if (Accumulator) {
+ for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) {
+ SmallVector<SDValue> Operands = {Accumulator};
+ for (unsigned K = 0; K < DefaultGroupSize - 1; ++K)
+ Operands.push_back(Elements[I + K]);
+ Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags);
+ }
+ }
+ }
+
+ return Accumulator;
}
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
@@ -3032,6 +3117,15 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index b49ecbed18c12..65e5f6b52678b 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -1,305 +1,433 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
-; RUN: | FileCheck -check-prefixes CHECK-SM80 %s
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s
; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_80 %}
; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
-; RUN: | FileCheck -check-prefixes CHECK-SM100 %s
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s
; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 %}
target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+; Check straight-line reduction.
+define half @reduce_fadd_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fadd_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<18>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT: mov.b16 %rs9, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs10, %rs7, %rs9;
+; CHECK-NEXT: add.rn.f16 %rs11, %rs10, %rs8;
+; CHECK-NEXT: add.rn.f16 %rs12, %rs11, %rs5;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs6;
+; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs3;
+; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs4;
+; CHECK-NEXT: add.rn.f16 %rs16, %rs15, %rs1;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fadd_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fadd_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<18>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: add.rn.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: add.rn.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: mov.b16 %rs16, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs15, %rs16;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fadd_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fadd_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fadd_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fadd_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: mov.b16 %rs8, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs9, %rs1, %rs8;
+; CHECK-NEXT: add.rn.f16 %rs10, %rs9, %rs2;
+; CHECK-NEXT: add.rn.f16 %rs11, %rs10, %rs3;
+; CHECK-NEXT: add.rn.f16 %rs12, %rs11, %rs4;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs5;
+; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs6;
+; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fadd(half 0.0, <7 x half> %in)
+ ret half %res
+}
+
; Check straight-line reduction.
-define float @reduce_fadd(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fadd(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<17>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_param_0];
-; CHECK-SM80-NEXT: add.rn.f32 %f9, %f1, 0f00000000;
-; CHECK-SM80-NEXT: add.rn.f32 %f10, %f9, %f2;
-; CHECK-SM80-NEXT: add.rn.f32 %f11, %f10, %f3;
-; CHECK-SM80-NEXT: add.rn.f32 %f12, %f11, %f4;
-; CHECK-SM80-NEXT: add.rn.f32 %f13, %f12, %f5;
-; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, %f6;
-; CHECK-SM80-NEXT: add.rn.f32 %f15, %f14, %f7;
-; CHECK-SM80-NEXT: add.rn.f32 %f16, %f15, %f8;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f16;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fadd(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<17>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_param_0];
-; CHECK-SM100-NEXT: add.rn.f32 %f9, %f1, 0f00000000;
-; CHECK-SM100-NEXT: add.rn.f32 %f10, %f9, %f2;
-; CHECK-SM100-NEXT: add.rn.f32 %f11, %f10, %f3;
-; CHECK-SM100-NEXT: add.rn.f32 %f12, %f11, %f4;
-; CHECK-SM100-NEXT: add.rn.f32 %f13, %f12, %f5;
-; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, %f6;
-; CHECK-SM100-NEXT: add.rn.f32 %f15, %f14, %f7;
-; CHECK-SM100-NEXT: add.rn.f32 %f16, %f15, %f8;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f16;
-; CHECK-SM100-NEXT: ret;
+define float @reduce_fadd_float(<8 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0];
+; CHECK-NEXT: add.rn.f32 %f9, %f1, 0f00000000;
+; CHECK-NEXT: add.rn.f32 %f10, %f9, %f2;
+; CHECK-NEXT: add.rn.f32 %f11, %f10, %f3;
+; CHECK-NEXT: add.rn.f32 %f12, %f11, %f4;
+; CHECK-NEXT: add.rn.f32 %f13, %f12, %f5;
+; CHECK-NEXT: add.rn.f32 %f14, %f13, %f6;
+; CHECK-NEXT: add.rn.f32 %f15, %f14, %f7;
+; CHECK-NEXT: add.rn.f32 %f16, %f15, %f8;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT: ret;
%res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fadd_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fadd_reassoc(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<17>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_param_0];
-; CHECK-SM80-NEXT: add.rn.f32 %f9, %f7, %f8;
-; CHECK-SM80-NEXT: add.rn.f32 %f10, %f5, %f6;
-; CHECK-SM80-NEXT: add.rn.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: add.rn.f32 %f12, %f3, %f4;
-; CHECK-SM80-NEXT: add.rn.f32 %f13, %f1, %f2;
-; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: add.rn.f32 %f15, %f14, %f11;
-; CHECK-SM80-NEXT: add.rn.f32 %f16, %f15, 0f00000000;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f16;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fadd_reassoc(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<17>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_param_0];
-; CHECK-SM100-NEXT: add.rn.f32 %f9, %f7, %f8;
-; CHECK-SM100-NEXT: add.rn.f32 %f10, %f5, %f6;
-; CHECK-SM100-NEXT: add.rn.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: add.rn.f32 %f12, %f3, %f4;
-; CHECK-SM100-NEXT: add.rn.f32 %f13, %f1, %f2;
-; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: add.rn.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: add.rn.f32 %f16, %f15, 0f00000000;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f16;
-; CHECK-SM100-NEXT: ret;
+define float @reduce_fadd_float_reassoc(<8 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-NEXT: add.rn.f32 %f9, %f7, %f8;
+; CHECK-NEXT: add.rn.f32 %f10, %f5, %f6;
+; CHECK-NEXT: add.rn.f32 %f11, %f10, %f9;
+; CHECK-NEXT: add.rn.f32 %f12, %f3, %f4;
+; CHECK-NEXT: add.rn.f32 %f13, %f1, %f2;
+; CHECK-NEXT: add.rn.f32 %f14, %f13, %f12;
+; CHECK-NEXT: add.rn.f32 %f15, %f14, %f11;
+; CHECK-NEXT: add.rn.f32 %f16, %f15, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
ret float %res
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fadd_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fadd_reassoc_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<15>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fadd_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_nonpow2_param_0];
-; CHECK-SM80-NEXT: add.rn.f32 %f8, %f5, %f6;
-; CHECK-SM80-NEXT: add.rn.f32 %f9, %f8, %f7;
-; CHECK-SM80-NEXT: add.rn.f32 %f10, %f3, %f4;
-; CHECK-SM80-NEXT: add.rn.f32 %f11, %f1, %f2;
-; CHECK-SM80-NEXT: add.rn.f32 %f12, %f11, %f10;
-; CHECK-SM80-NEXT: add.rn.f32 %f13, %f12, %f9;
-; CHECK-SM80-NEXT: add.rn.f32 %f14, %f13, 0f00000000;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f14;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fadd_reassoc_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<15>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fadd_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_reassoc_nonpow2_param_0];
-; CHECK-SM100-NEXT: add.rn.f32 %f8, %f5, %f6;
-; CHECK-SM100-NEXT: add.rn.f32 %f9, %f8, %f7;
-; CHECK-SM100-NEXT: add.rn.f32 %f10, %f3, %f4;
-; CHECK-SM100-NEXT: add.rn.f32 %f11, %f1, %f2;
-; CHECK-SM100-NEXT: add.rn.f32 %f12, %f11, %f10;
-; CHECK-SM100-NEXT: add.rn.f32 %f13, %f12, %f9;
-; CHECK-SM100-NEXT: add.rn.f32 %f14, %f13, 0f00000000;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f14;
-; CHECK-SM100-NEXT: ret;
+define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT: add.rn.f32 %f8, %f5, %f6;
+; CHECK-NEXT: add.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT: add.rn.f32 %f10, %f3, %f4;
+; CHECK-NEXT: add.rn.f32 %f11, %f1, %f2;
+; CHECK-NEXT: add.rn.f32 %f12, %f11, %f10;
+; CHECK-NEXT: add.rn.f32 %f13, %f12, %f9;
+; CHECK-NEXT: add.rn.f32 %f14, %f13, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f14;
+; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in)
ret float %res
}
-define float @reduce_fmul(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmul(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<16>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_param_0];
-; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f1, %f2;
-; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f9, %f3;
-; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f10, %f4;
-; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f11, %f5;
-; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f12, %f6;
-; CHECK-SM80-NEXT: mul.rn.f32 %f14, %f13, %f7;
-; CHECK-SM80-NEXT: mul.rn.f32 %f15, %f14, %f8;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fmul(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_param_0];
-; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f1, %f2;
-; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f9, %f3;
-; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f10, %f4;
-; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f11, %f5;
-; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f12, %f6;
-; CHECK-SM100-NEXT: mul.rn.f32 %f14, %f13, %f7;
-; CHECK-SM100-NEXT: mul.rn.f32 %f15, %f14, %f8;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
-; CHECK-SM100-NEXT: ret;
+; Check straight-line reduction.
+define half @reduce_fmul_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmul_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs9, %rs7, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs5;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs6;
+; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs3;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs12, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs1;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fmul_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmul_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: mul.rn.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: mul.rn.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmul_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmul_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmul_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: mul.rn.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <7 x half> %in)
+ ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmul_float(<8 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0];
+; CHECK-NEXT: mul.rn.f32 %f9, %f1, %f2;
+; CHECK-NEXT: mul.rn.f32 %f10, %f9, %f3;
+; CHECK-NEXT: mul.rn.f32 %f11, %f10, %f4;
+; CHECK-NEXT: mul.rn.f32 %f12, %f11, %f5;
+; CHECK-NEXT: mul.rn.f32 %f13, %f12, %f6;
+; CHECK-NEXT: mul.rn.f32 %f14, %f13, %f7;
+; CHECK-NEXT: mul.rn.f32 %f15, %f14, %f8;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT: ret;
%res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fmul_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmul_reassoc(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<16>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_param_0];
-; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f7, %f8;
-; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f5, %f6;
-; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f3, %f4;
-; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f1, %f2;
-; CHECK-SM80-NEXT: mul.rn.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: mul.rn.f32 %f15, %f14, %f11;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fmul_reassoc(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_param_0];
-; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f7, %f8;
-; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f5, %f6;
-; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f3, %f4;
-; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f1, %f2;
-; CHECK-SM100-NEXT: mul.rn.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: mul.rn.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
-; CHECK-SM100-NEXT: ret;
+define float @reduce_fmul_float_reassoc(<8 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-NEXT: mul.rn.f32 %f9, %f7, %f8;
+; CHECK-NEXT: mul.rn.f32 %f10, %f5, %f6;
+; CHECK-NEXT: mul.rn.f32 %f11, %f10, %f9;
+; CHECK-NEXT: mul.rn.f32 %f12, %f3, %f4;
+; CHECK-NEXT: mul.rn.f32 %f13, %f1, %f2;
+; CHECK-NEXT: mul.rn.f32 %f14, %f13, %f12;
+; CHECK-NEXT: mul.rn.f32 %f15, %f14, %f11;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
ret float %res
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fmul_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmul_reassoc_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .f32 %f<14>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmul_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_nonpow2_param_0];
-; CHECK-SM80-NEXT: mul.rn.f32 %f8, %f5, %f6;
-; CHECK-SM80-NEXT: mul.rn.f32 %f9, %f8, %f7;
-; CHECK-SM80-NEXT: mul.rn.f32 %f10, %f3, %f4;
-; CHECK-SM80-NEXT: mul.rn.f32 %f11, %f1, %f2;
-; CHECK-SM80-NEXT: mul.rn.f32 %f12, %f11, %f10;
-; CHECK-SM80-NEXT: mul.rn.f32 %f13, %f12, %f9;
-; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fmul_reassoc_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<14>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmul_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_reassoc_nonpow2_param_0];
-; CHECK-SM100-NEXT: mul.rn.f32 %f8, %f5, %f6;
-; CHECK-SM100-NEXT: mul.rn.f32 %f9, %f8, %f7;
-; CHECK-SM100-NEXT: mul.rn.f32 %f10, %f3, %f4;
-; CHECK-SM100-NEXT: mul.rn.f32 %f11, %f1, %f2;
-; CHECK-SM100-NEXT: mul.rn.f32 %f12, %f11, %f10;
-; CHECK-SM100-NEXT: mul.rn.f32 %f13, %f12, %f9;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f13;
-; CHECK-SM100-NEXT: ret;
+define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT: mul.rn.f32 %f8, %f5, %f6;
+; CHECK-NEXT: mul.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT: mul.rn.f32 %f10, %f3, %f4;
+; CHECK-NEXT: mul.rn.f32 %f11, %f1, %f2;
+; CHECK-NEXT: mul.rn.f32 %f12, %f11, %f10;
+; CHECK-NEXT: mul.rn.f32 %f13, %f12, %f9;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
ret float %res
}
-define float @reduce_fmax(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmax(
+; Check straight-line reduction.
+define half @reduce_fmax_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: max.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: max.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: max.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: max.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: max.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: max.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fmax(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fmax_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmax(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmax_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: max.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: max.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmax(<7 x half> %in)
+ ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmax_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_param_0];
-; CHECK-SM80-NEXT: max.f32 %f9, %f4, %f8;
-; CHECK-SM80-NEXT: max.f32 %f10, %f2, %f6;
-; CHECK-SM80-NEXT: max.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: max.f32 %f12, %f3, %f7;
-; CHECK-SM80-NEXT: max.f32 %f13, %f1, %f5;
-; CHECK-SM80-NEXT: max.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: max.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
+; CHECK-SM80-NEXT: max.f32 %f9, %f1, %f2;
+; CHECK-SM80-NEXT: max.f32 %f10, %f9, %f3;
+; CHECK-SM80-NEXT: max.f32 %f11, %f10, %f4;
+; CHECK-SM80-NEXT: max.f32 %f12, %f11, %f5;
+; CHECK-SM80-NEXT: max.f32 %f13, %f12, %f6;
+; CHECK-SM80-NEXT: max.f32 %f14, %f13, %f7;
+; CHECK-SM80-NEXT: max.f32 %f15, %f14, %f8;
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmax(
+; CHECK-SM100-LABEL: reduce_fmax_float(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_param_0];
-; CHECK-SM100-NEXT: max.f32 %f9, %f4, %f8;
-; CHECK-SM100-NEXT: max.f32 %f10, %f2, %f6;
-; CHECK-SM100-NEXT: max.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: max.f32 %f12, %f3, %f7;
-; CHECK-SM100-NEXT: max.f32 %f13, %f1, %f5;
-; CHECK-SM100-NEXT: max.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: max.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
+; CHECK-SM100-NEXT: max.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.f32 %f10, %f9, %f4, %f5;
+; CHECK-SM100-NEXT: max.f32 %f11, %f10, %f6, %f7;
+; CHECK-SM100-NEXT: max.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fmax_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmax_reassoc(
+define float @reduce_fmax_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_param_0];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
; CHECK-SM80-NEXT: max.f32 %f9, %f7, %f8;
; CHECK-SM80-NEXT: max.f32 %f10, %f5, %f6;
; CHECK-SM80-NEXT: max.f32 %f11, %f10, %f9;
@@ -310,13 +438,13 @@ define float @reduce_fmax_reassoc(<8 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmax_reassoc(
+; CHECK-SM100-LABEL: reduce_fmax_float_reassoc(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_param_0];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
; CHECK-SM100-NEXT: max.f32 %f9, %f4, %f5, %f6;
; CHECK-SM100-NEXT: max.f32 %f10, %f1, %f2, %f3;
; CHECK-SM100-NEXT: max.f32 %f11, %f10, %f9, %f7;
@@ -328,15 +456,15 @@ define float @reduce_fmax_reassoc(<8 x float> %in) {
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fmax_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmax_reassoc_nonpow2(
+define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float_reassoc_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<14>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmax_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
; CHECK-SM80-NEXT: max.f32 %f8, %f5, %f6;
; CHECK-SM80-NEXT: max.f32 %f9, %f8, %f7;
; CHECK-SM80-NEXT: max.f32 %f10, %f3, %f4;
@@ -346,14 +474,14 @@ define float @reduce_fmax_reassoc_nonpow2(<7 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmax_reassoc_nonpow2(
+; CHECK-SM100-LABEL: reduce_fmax_float_reassoc_nonpow2(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<11>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmax_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
; CHECK-SM100-NEXT: max.f32 %f8, %f4, %f5, %f6;
; CHECK-SM100-NEXT: max.f32 %f9, %f1, %f2, %f3;
; CHECK-SM100-NEXT: max.f32 %f10, %f9, %f8, %f7;
@@ -363,53 +491,127 @@ define float @reduce_fmax_reassoc_nonpow2(<7 x float> %in) {
ret float %res
}
-define float @reduce_fmin(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmin(
+; Check straight-line reduction.
+define half @reduce_fmin_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmin_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: min.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: min.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: min.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: min.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: min.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: min.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: min.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fmin(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fmin_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmin_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmin(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmin_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmin_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: min.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: min.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmin(<7 x half> %in)
+ ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmin_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_param_0];
-; CHECK-SM80-NEXT: min.f32 %f9, %f4, %f8;
-; CHECK-SM80-NEXT: min.f32 %f10, %f2, %f6;
-; CHECK-SM80-NEXT: min.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: min.f32 %f12, %f3, %f7;
-; CHECK-SM80-NEXT: min.f32 %f13, %f1, %f5;
-; CHECK-SM80-NEXT: min.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: min.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0];
+; CHECK-SM80-NEXT: min.f32 %f9, %f1, %f2;
+; CHECK-SM80-NEXT: min.f32 %f10, %f9, %f3;
+; CHECK-SM80-NEXT: min.f32 %f11, %f10, %f4;
+; CHECK-SM80-NEXT: min.f32 %f12, %f11, %f5;
+; CHECK-SM80-NEXT: min.f32 %f13, %f12, %f6;
+; CHECK-SM80-NEXT: min.f32 %f14, %f13, %f7;
+; CHECK-SM80-NEXT: min.f32 %f15, %f14, %f8;
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmin(
+; CHECK-SM100-LABEL: reduce_fmin_float(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_param_0];
-; CHECK-SM100-NEXT: min.f32 %f9, %f4, %f8;
-; CHECK-SM100-NEXT: min.f32 %f10, %f2, %f6;
-; CHECK-SM100-NEXT: min.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: min.f32 %f12, %f3, %f7;
-; CHECK-SM100-NEXT: min.f32 %f13, %f1, %f5;
-; CHECK-SM100-NEXT: min.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: min.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0];
+; CHECK-SM100-NEXT: min.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.f32 %f10, %f9, %f4, %f5;
+; CHECK-SM100-NEXT: min.f32 %f11, %f10, %f6, %f7;
+; CHECK-SM100-NEXT: min.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmin(<8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fmin_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmin_reassoc(
+define float @reduce_fmin_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_param_0];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0];
; CHECK-SM80-NEXT: min.f32 %f9, %f7, %f8;
; CHECK-SM80-NEXT: min.f32 %f10, %f5, %f6;
; CHECK-SM80-NEXT: min.f32 %f11, %f10, %f9;
@@ -420,13 +622,13 @@ define float @reduce_fmin_reassoc(<8 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmin_reassoc(
+; CHECK-SM100-LABEL: reduce_fmin_float_reassoc(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_param_0];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0];
; CHECK-SM100-NEXT: min.f32 %f9, %f4, %f5, %f6;
; CHECK-SM100-NEXT: min.f32 %f10, %f1, %f2, %f3;
; CHECK-SM100-NEXT: min.f32 %f11, %f10, %f9, %f7;
@@ -438,15 +640,15 @@ define float @reduce_fmin_reassoc(<8 x float> %in) {
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fmin_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmin_reassoc_nonpow2(
+define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float_reassoc_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<14>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmin_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
; CHECK-SM80-NEXT: min.f32 %f8, %f5, %f6;
; CHECK-SM80-NEXT: min.f32 %f9, %f8, %f7;
; CHECK-SM80-NEXT: min.f32 %f10, %f3, %f4;
@@ -456,14 +658,14 @@ define float @reduce_fmin_reassoc_nonpow2(<7 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmin_reassoc_nonpow2(
+; CHECK-SM100-LABEL: reduce_fmin_float_reassoc_nonpow2(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<11>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmin_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
; CHECK-SM100-NEXT: min.f32 %f8, %f4, %f5, %f6;
; CHECK-SM100-NEXT: min.f32 %f9, %f1, %f2, %f3;
; CHECK-SM100-NEXT: min.f32 %f10, %f9, %f8, %f7;
@@ -473,53 +675,127 @@ define float @reduce_fmin_reassoc_nonpow2(<7 x float> %in) {
ret float %res
}
-define float @reduce_fmaximum(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmaximum(
+; Check straight-line reduction.
+define half @reduce_fmaximum_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmaximum_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: max.NaN.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: max.NaN.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: max.NaN.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: max.NaN.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fmaximum(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fmaximum_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fmaximum_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmaximum(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fmaximum_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmaximum_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmaximum_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmaximum_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: max.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: max.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fmaximum(<7 x half> %in)
+ ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fmaximum_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_param_0];
-; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f4, %f8;
-; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f2, %f6;
-; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: max.NaN.f32 %f12, %f3, %f7;
-; CHECK-SM80-NEXT: max.NaN.f32 %f13, %f1, %f5;
-; CHECK-SM80-NEXT: max.NaN.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: max.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f1, %f2;
+; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f9, %f3;
+; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f10, %f4;
+; CHECK-SM80-NEXT: max.NaN.f32 %f12, %f11, %f5;
+; CHECK-SM80-NEXT: max.NaN.f32 %f13, %f12, %f6;
+; CHECK-SM80-NEXT: max.NaN.f32 %f14, %f13, %f7;
+; CHECK-SM80-NEXT: max.NaN.f32 %f15, %f14, %f8;
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmaximum(
+; CHECK-SM100-LABEL: reduce_fmaximum_float(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_param_0];
-; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f4, %f8;
-; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f2, %f6;
-; CHECK-SM100-NEXT: max.NaN.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: max.NaN.f32 %f12, %f3, %f7;
-; CHECK-SM100-NEXT: max.NaN.f32 %f13, %f1, %f5;
-; CHECK-SM100-NEXT: max.NaN.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: max.NaN.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f9, %f4, %f5;
+; CHECK-SM100-NEXT: max.NaN.f32 %f11, %f10, %f6, %f7;
+; CHECK-SM100-NEXT: max.NaN.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fmaximum_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmaximum_reassoc(
+define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_param_0];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0];
; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f7, %f8;
; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f5, %f6;
; CHECK-SM80-NEXT: max.NaN.f32 %f11, %f10, %f9;
@@ -530,13 +806,13 @@ define float @reduce_fmaximum_reassoc(<8 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmaximum_reassoc(
+; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_param_0];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0];
; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f4, %f5, %f6;
; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f1, %f2, %f3;
; CHECK-SM100-NEXT: max.NaN.f32 %f11, %f10, %f9, %f7;
@@ -548,15 +824,15 @@ define float @reduce_fmaximum_reassoc(<8 x float> %in) {
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fmaximum_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fmaximum_reassoc_nonpow2(
+define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<14>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmaximum_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
; CHECK-SM80-NEXT: max.NaN.f32 %f8, %f5, %f6;
; CHECK-SM80-NEXT: max.NaN.f32 %f9, %f8, %f7;
; CHECK-SM80-NEXT: max.NaN.f32 %f10, %f3, %f4;
@@ -566,14 +842,14 @@ define float @reduce_fmaximum_reassoc_nonpow2(<7 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fmaximum_reassoc_nonpow2(
+; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<11>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmaximum_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
; CHECK-SM100-NEXT: max.NaN.f32 %f8, %f4, %f5, %f6;
; CHECK-SM100-NEXT: max.NaN.f32 %f9, %f1, %f2, %f3;
; CHECK-SM100-NEXT: max.NaN.f32 %f10, %f9, %f8, %f7;
@@ -583,53 +859,127 @@ define float @reduce_fmaximum_reassoc_nonpow2(<7 x float> %in) {
ret float %res
}
-define float @reduce_fminimum(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fminimum(
+; Check straight-line reduction.
+define half @reduce_fminimum_half(<8 x half> %in) {
+; CHECK-LABEL: reduce_fminimum_half(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: min.NaN.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: min.NaN.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: min.NaN.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: min.NaN.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call half @llvm.vector.reduce.fminimum(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction.
+define half @reduce_fminimum_half_reassoc(<8 x half> %in) {
+; CHECK-LABEL: reduce_fminimum_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fminimum(<8 x half> %in)
+ ret half %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-LABEL: reduce_fminimum_half_reassoc_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_fminimum_half_reassoc_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fminimum_half_reassoc_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fminimum_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: min.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: min.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-NEXT: ret;
+ %res = call reassoc half @llvm.vector.reduce.fminimum(<7 x half> %in)
+ ret half %res
+}
+
+; Check straight-line reduction.
+define float @reduce_fminimum_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_float(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_param_0];
-; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f4, %f8;
-; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f2, %f6;
-; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f10, %f9;
-; CHECK-SM80-NEXT: min.NaN.f32 %f12, %f3, %f7;
-; CHECK-SM80-NEXT: min.NaN.f32 %f13, %f1, %f5;
-; CHECK-SM80-NEXT: min.NaN.f32 %f14, %f13, %f12;
-; CHECK-SM80-NEXT: min.NaN.f32 %f15, %f14, %f11;
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f1, %f2;
+; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f9, %f3;
+; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f10, %f4;
+; CHECK-SM80-NEXT: min.NaN.f32 %f12, %f11, %f5;
+; CHECK-SM80-NEXT: min.NaN.f32 %f13, %f12, %f6;
+; CHECK-SM80-NEXT: min.NaN.f32 %f14, %f13, %f7;
+; CHECK-SM80-NEXT: min.NaN.f32 %f15, %f14, %f8;
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fminimum(
+; CHECK-SM100-LABEL: reduce_fminimum_float(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .f32 %f<16>;
+; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_param_0];
-; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f4, %f8;
-; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f2, %f6;
-; CHECK-SM100-NEXT: min.NaN.f32 %f11, %f10, %f9;
-; CHECK-SM100-NEXT: min.NaN.f32 %f12, %f3, %f7;
-; CHECK-SM100-NEXT: min.NaN.f32 %f13, %f1, %f5;
-; CHECK-SM100-NEXT: min.NaN.f32 %f14, %f13, %f12;
-; CHECK-SM100-NEXT: min.NaN.f32 %f15, %f14, %f11;
-; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f15;
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f1, %f2, %f3;
+; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f9, %f4, %f5;
+; CHECK-SM100-NEXT: min.NaN.f32 %f11, %f10, %f6, %f7;
+; CHECK-SM100-NEXT: min.NaN.f32 %f12, %f11, %f8;
+; CHECK-SM100-NEXT: st.param.f32 [func_retval0], %f12;
; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fminimum(<8 x float> %in)
ret float %res
}
; Check tree reduction.
-define float @reduce_fminimum_reassoc(<8 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fminimum_reassoc(
+define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_reassoc_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_param_0];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0];
; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f7, %f8;
; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f5, %f6;
; CHECK-SM80-NEXT: min.NaN.f32 %f11, %f10, %f9;
@@ -640,13 +990,13 @@ define float @reduce_fminimum_reassoc(<8 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fminimum_reassoc(
+; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_reassoc_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_param_0];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0];
; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f4, %f5, %f6;
; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f1, %f2, %f3;
; CHECK-SM100-NEXT: min.NaN.f32 %f11, %f10, %f9, %f7;
@@ -658,15 +1008,15 @@ define float @reduce_fminimum_reassoc(<8 x float> %in) {
}
; Check tree reduction with non-power of 2 size.
-define float @reduce_fminimum_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-SM80-LABEL: reduce_fminimum_reassoc_nonpow2(
+define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .f32 %f<14>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fminimum_reassoc_nonpow2_param_0+24];
-; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_reassoc_nonpow2_param_0+16];
-; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: ld.param.f32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
; CHECK-SM80-NEXT: min.NaN.f32 %f8, %f5, %f6;
; CHECK-SM80-NEXT: min.NaN.f32 %f9, %f8, %f7;
; CHECK-SM80-NEXT: min.NaN.f32 %f10, %f3, %f4;
@@ -676,14 +1026,14 @@ define float @reduce_fminimum_reassoc_nonpow2(<7 x float> %in) {
; CHECK-SM80-NEXT: st.param.f32 [func_retval0], %f13;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_fminimum_reassoc_nonpow2(
+; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc_nonpow2(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .f32 %f<11>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fminimum_reassoc_nonpow2_param_0+24];
-; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_reassoc_nonpow2_param_0+16];
-; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: ld.param.f32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
; CHECK-SM100-NEXT: min.NaN.f32 %f8, %f4, %f5, %f6;
; CHECK-SM100-NEXT: min.NaN.f32 %f9, %f1, %f2, %f3;
; CHECK-SM100-NEXT: min.NaN.f32 %f10, %f9, %f8, %f7;
@@ -692,3 +1042,867 @@ define float @reduce_fminimum_reassoc_nonpow2(<7 x float> %in) {
%res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
ret float %res
}
+
+; Check tree reduction.
+define i16 @reduce_add_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_add_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: add.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: add.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: add.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: add.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: add.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: add.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_add_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-NEXT: add.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: add.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: add.s16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: add.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.add(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_add_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_add_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
+; CHECK-NEXT: add.s32 %r9, %r7, %r8;
+; CHECK-NEXT: add.s32 %r10, %r5, %r6;
+; CHECK-NEXT: add.s32 %r11, %r10, %r9;
+; CHECK-NEXT: add.s32 %r12, %r3, %r4;
+; CHECK-NEXT: add.s32 %r13, %r1, %r2;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.add(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_add_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_add_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
+; CHECK-NEXT: add.s32 %r8, %r5, %r6;
+; CHECK-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-NEXT: add.s32 %r10, %r3, %r4;
+; CHECK-NEXT: add.s32 %r11, %r1, %r2;
+; CHECK-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-NEXT: add.s32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.add(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_mul_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_mul_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: mul.lo.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: mul.lo.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: mul.lo.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: mul.lo.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.mul(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_mul_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_mul_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
+; CHECK-NEXT: mul.lo.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: mul.lo.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: mul.lo.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: mul.lo.s16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: mul.lo.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.mul(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_mul_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_mul_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
+; CHECK-NEXT: mul.lo.s32 %r9, %r7, %r8;
+; CHECK-NEXT: mul.lo.s32 %r10, %r5, %r6;
+; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9;
+; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r4;
+; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r2;
+; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12;
+; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_mul_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
+; CHECK-NEXT: mul.lo.s32 %r8, %r5, %r6;
+; CHECK-NEXT: mul.lo.s32 %r9, %r8, %r7;
+; CHECK-NEXT: mul.lo.s32 %r10, %r3, %r4;
+; CHECK-NEXT: mul.lo.s32 %r11, %r1, %r2;
+; CHECK-NEXT: mul.lo.s32 %r12, %r11, %r10;
+; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.mul(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_umax_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_umax_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.u16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.u16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.u16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.u16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.u16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.u16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_umax_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-NEXT: max.u16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.u16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.u16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.u16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.u16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: max.u16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.umax(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_umax_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_umax_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
+; CHECK-NEXT: max.u32 %r9, %r7, %r8;
+; CHECK-NEXT: max.u32 %r10, %r5, %r6;
+; CHECK-NEXT: max.u32 %r11, %r10, %r9;
+; CHECK-NEXT: max.u32 %r12, %r3, %r4;
+; CHECK-NEXT: max.u32 %r13, %r1, %r2;
+; CHECK-NEXT: max.u32 %r14, %r13, %r12;
+; CHECK-NEXT: max.u32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_umax_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
+; CHECK-NEXT: max.u32 %r8, %r5, %r6;
+; CHECK-NEXT: max.u32 %r9, %r8, %r7;
+; CHECK-NEXT: max.u32 %r10, %r3, %r4;
+; CHECK-NEXT: max.u32 %r11, %r1, %r2;
+; CHECK-NEXT: max.u32 %r12, %r11, %r10;
+; CHECK-NEXT: max.u32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.umax(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_umin_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_umin_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.u16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.u16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.u16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.u16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.u16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.u16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.u16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_umin_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-NEXT: min.u16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.u16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.u16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.u16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.u16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: min.u16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.umin(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_umin_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_umin_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
+; CHECK-NEXT: min.u32 %r9, %r7, %r8;
+; CHECK-NEXT: min.u32 %r10, %r5, %r6;
+; CHECK-NEXT: min.u32 %r11, %r10, %r9;
+; CHECK-NEXT: min.u32 %r12, %r3, %r4;
+; CHECK-NEXT: min.u32 %r13, %r1, %r2;
+; CHECK-NEXT: min.u32 %r14, %r13, %r12;
+; CHECK-NEXT: min.u32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_umin_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
+; CHECK-NEXT: min.u32 %r8, %r5, %r6;
+; CHECK-NEXT: min.u32 %r9, %r8, %r7;
+; CHECK-NEXT: min.u32 %r10, %r3, %r4;
+; CHECK-NEXT: min.u32 %r11, %r1, %r2;
+; CHECK-NEXT: min.u32 %r12, %r11, %r10;
+; CHECK-NEXT: min.u32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.umin(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_smax_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_smax_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_smax_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-NEXT: max.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.s16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: max.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.smax(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_smax_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_smax_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
+; CHECK-NEXT: max.s32 %r9, %r7, %r8;
+; CHECK-NEXT: max.s32 %r10, %r5, %r6;
+; CHECK-NEXT: max.s32 %r11, %r10, %r9;
+; CHECK-NEXT: max.s32 %r12, %r3, %r4;
+; CHECK-NEXT: max.s32 %r13, %r1, %r2;
+; CHECK-NEXT: max.s32 %r14, %r13, %r12;
+; CHECK-NEXT: max.s32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_smax_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
+; CHECK-NEXT: max.s32 %r8, %r5, %r6;
+; CHECK-NEXT: max.s32 %r9, %r8, %r7;
+; CHECK-NEXT: max.s32 %r10, %r3, %r4;
+; CHECK-NEXT: max.s32 %r11, %r1, %r2;
+; CHECK-NEXT: max.s32 %r12, %r11, %r10;
+; CHECK-NEXT: max.s32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.smax(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_smin_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_smin_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_smin_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-NEXT: min.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.s16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: min.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.smin(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_smin_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_smin_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
+; CHECK-NEXT: min.s32 %r9, %r7, %r8;
+; CHECK-NEXT: min.s32 %r10, %r5, %r6;
+; CHECK-NEXT: min.s32 %r11, %r10, %r9;
+; CHECK-NEXT: min.s32 %r12, %r3, %r4;
+; CHECK-NEXT: min.s32 %r13, %r1, %r2;
+; CHECK-NEXT: min.s32 %r14, %r13, %r12;
+; CHECK-NEXT: min.s32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_smin_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
+; CHECK-NEXT: min.s32 %r8, %r5, %r6;
+; CHECK-NEXT: min.s32 %r9, %r8, %r7;
+; CHECK-NEXT: min.s32 %r10, %r3, %r4;
+; CHECK-NEXT: min.s32 %r11, %r1, %r2;
+; CHECK-NEXT: min.s32 %r12, %r11, %r10;
+; CHECK-NEXT: min.s32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.smin(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_and_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_and_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: and.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: and.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: and.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: and.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: and.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_and_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_and_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
+; CHECK-NEXT: and.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: and.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: and.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: and.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: and.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: and.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.and(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_and_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_and_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
+; CHECK-NEXT: and.b32 %r9, %r7, %r8;
+; CHECK-NEXT: and.b32 %r10, %r5, %r6;
+; CHECK-NEXT: and.b32 %r11, %r10, %r9;
+; CHECK-NEXT: and.b32 %r12, %r3, %r4;
+; CHECK-NEXT: and.b32 %r13, %r1, %r2;
+; CHECK-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-NEXT: and.b32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.and(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_and_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_and_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
+; CHECK-NEXT: and.b32 %r8, %r5, %r6;
+; CHECK-NEXT: and.b32 %r9, %r8, %r7;
+; CHECK-NEXT: and.b32 %r10, %r3, %r4;
+; CHECK-NEXT: and.b32 %r11, %r1, %r2;
+; CHECK-NEXT: and.b32 %r12, %r11, %r10;
+; CHECK-NEXT: and.b32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.and(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_or_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_or_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: or.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: or.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: or.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: or.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: or.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: or.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_or_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_or_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
+; CHECK-NEXT: or.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: or.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: or.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: or.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: or.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.or(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_or_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_or_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
+; CHECK-NEXT: or.b32 %r9, %r7, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r6;
+; CHECK-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-NEXT: or.b32 %r12, %r3, %r4;
+; CHECK-NEXT: or.b32 %r13, %r1, %r2;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: or.b32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.or(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_or_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_or_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
+; CHECK-NEXT: or.b32 %r8, %r5, %r6;
+; CHECK-NEXT: or.b32 %r9, %r8, %r7;
+; CHECK-NEXT: or.b32 %r10, %r3, %r4;
+; CHECK-NEXT: or.b32 %r11, %r1, %r2;
+; CHECK-NEXT: or.b32 %r12, %r11, %r10;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.or(<7 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction.
+define i16 @reduce_xor_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_xor_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: xor.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: xor.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: xor.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: xor.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: xor.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: xor.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_xor_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.u16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
+; CHECK-NEXT: xor.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: xor.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: xor.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: xor.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: xor.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: xor.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.xor(<7 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction.
+define i32 @reduce_xor_i32(<8 x i32> %in) {
+; CHECK-LABEL: reduce_xor_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
+; CHECK-NEXT: xor.b32 %r9, %r7, %r8;
+; CHECK-NEXT: xor.b32 %r10, %r5, %r6;
+; CHECK-NEXT: xor.b32 %r11, %r10, %r9;
+; CHECK-NEXT: xor.b32 %r12, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r13, %r1, %r2;
+; CHECK-NEXT: xor.b32 %r14, %r13, %r12;
+; CHECK-NEXT: xor.b32 %r15, %r14, %r11;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in)
+ ret i32 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) {
+; CHECK-LABEL: reduce_xor_i32_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
+; CHECK-NEXT: xor.b32 %r8, %r5, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r8, %r7;
+; CHECK-NEXT: xor.b32 %r10, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r11, %r1, %r2;
+; CHECK-NEXT: xor.b32 %r12, %r11, %r10;
+; CHECK-NEXT: xor.b32 %r13, %r12, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-NEXT: ret;
+ %res = call i32 @llvm.vector.reduce.xor(<7 x i32> %in)
+ ret i32 %res
+}
>From 4e481f872ac7970b1f718c039e33474794cad2ad Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 17 Apr 2025 21:30:20 -0700
Subject: [PATCH 3/3] [NVPTX] support VECREDUCE_SEQ ops and remove option
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 32 ++++++++-----
.../CodeGen/NVPTX/reduction-intrinsics.ll | 48 +++++++++----------
2 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8a112e77115a8..f3e6da2b357e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -85,12 +85,6 @@ static cl::opt<unsigned> FMAContractLevelOpt(
" 1: do it 2: do it aggressively"),
cl::init(2));
-static cl::opt<bool> DisableFOpTreeReduce(
- "nvptx-disable-fop-tree-reduce", cl::Hidden,
- cl::desc("NVPTX Specific: don't emit tree reduction for floating-point "
- "reduction operations"),
- cl::init(false));
-
static cl::opt<int> UsePrecDivF32(
"nvptx-prec-divf32", cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
@@ -841,6 +835,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
EltVT == MVT::f64) {
setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL,
ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
VT, Custom);
@@ -2204,12 +2199,19 @@ static SDValue BuildTreeReduction(
/// max3/min3 when the target supports them.
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
- if (DisableFOpTreeReduce)
- return SDValue();
-
SDLoc DL(Op);
const SDNodeFlags Flags = Op->getFlags();
- const SDValue &Vector = Op.getOperand(0);
+ SDValue Vector;
+ SDValue Accumulator;
+ if (Op->getOpcode() == ISD::VECREDUCE_SEQ_FADD ||
+ Op->getOpcode() == ISD::VECREDUCE_SEQ_FMUL) {
+ // special case with accumulator as first arg
+ Accumulator = Op.getOperand(0);
+ Vector = Op.getOperand(1);
+ } else {
+ // default case
+ Vector = Op.getOperand(0);
+ }
EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;
@@ -2221,10 +2223,12 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_SEQ_FADD:
ScalarOps = {{ISD::FADD, 2}};
IsReassociatable = false;
break;
case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_SEQ_FMUL:
ScalarOps = {{ISD::FMUL, 2}};
IsReassociatable = false;
break;
@@ -2303,11 +2307,13 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
}
// Lower to tree reduction.
- if (IsReassociatable || Flags.hasAllowReassociation())
+ if (IsReassociatable || Flags.hasAllowReassociation()) {
+ // we don't expect an accumulator for reassociatable vector reduction ops
+ assert(!Accumulator && "unexpected accumulator");
return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+ }
// Lower to sequential reduction.
- SDValue Accumulator;
for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
@@ -3113,6 +3119,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 65e5f6b52678b..a9101ba3ca651 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -23,19 +23,19 @@ define half @reduce_fadd_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
-; CHECK-NEXT: mov.b16 %rs9, 0x0000;
-; CHECK-NEXT: add.rn.f16 %rs10, %rs7, %rs9;
-; CHECK-NEXT: add.rn.f16 %rs11, %rs10, %rs8;
-; CHECK-NEXT: add.rn.f16 %rs12, %rs11, %rs5;
-; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs6;
-; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs3;
-; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs4;
-; CHECK-NEXT: add.rn.f16 %rs16, %rs15, %rs1;
-; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs2;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: mov.b16 %rs3, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs4, %rs1, %rs3;
+; CHECK-NEXT: add.rn.f16 %rs5, %rs4, %rs2;
+; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r2;
+; CHECK-NEXT: add.rn.f16 %rs8, %rs5, %rs6;
+; CHECK-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: mov.b32 {%rs10, %rs11}, %r3;
+; CHECK-NEXT: add.rn.f16 %rs12, %rs9, %rs10;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs11;
+; CHECK-NEXT: mov.b32 {%rs14, %rs15}, %r4;
+; CHECK-NEXT: add.rn.f16 %rs16, %rs13, %rs14;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs15;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
@@ -174,17 +174,17 @@ define half @reduce_fmul_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
-; CHECK-NEXT: mul.rn.f16 %rs9, %rs7, %rs8;
-; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs5;
-; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs6;
-; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs3;
-; CHECK-NEXT: mul.rn.f16 %rs13, %rs12, %rs4;
-; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs1;
-; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs2;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: mul.rn.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs13;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
More information about the llvm-commits
mailing list