[llvm] Handle VECREDUCE intrinsics in NVPTX backend (PR #136253)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu May 22 04:10:58 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/136253
>From 7c58476b4788f7223d8c05204a4093e46fa9a895 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 16 Apr 2025 17:17:47 -0700
Subject: [PATCH 1/4] [NVPTX] lower VECREDUCE intrinsics to tree reduction
Also adds support for sm_100+ fmax3/fmin3 instructions, introduced in
PTX 8.8.
This method of tree reduction has a few benefits over the default in
DAGTypeLegalizer:
- Produces optimal number of operations supported by the target. Instead
of progressively splitting the vector operand top-down, first
scalarize it and then build the tree bottom-up. This uses larger
operations when available and leaves smaller ones for the remaining
elements.
- Faster compile time. Happens in one pass over the intrinsic, rather
than O(N) passes if iteratively splitting the vector operands.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 128 ++
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 54 +
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 +
.../CodeGen/NVPTX/reduction-intrinsics.ll | 1927 ++++++++++++-----
5 files changed, 1601 insertions(+), 516 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 51f4682c5ba15..bafd91b930052 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -85,6 +85,12 @@ static cl::opt<unsigned> FMAContractLevelOpt(
" 1: do it 2: do it aggressively"),
cl::init(2));
+static cl::opt<bool> DisableFOpTreeReduce(
+ "nvptx-disable-fop-tree-reduce", cl::Hidden,
+ cl::desc("NVPTX Specific: don't emit tree reduction for floating-point "
+ "reduction operations"),
+ cl::init(false));
+
static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32(
"nvptx-prec-divf32", cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
@@ -863,6 +869,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (STI.allowFP16Math() || STI.hasBF16Math())
setTargetDAGCombine(ISD::SETCC);
+ // Vector reduction operations. These are transformed into a tree evaluation
+ // of nodes which may or may not be legal.
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ }
+
// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in
@@ -1120,6 +1135,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
+ MAKE_CASE(NVPTXISD::FMAXNUM3)
+ MAKE_CASE(NVPTXISD::FMINNUM3)
+ MAKE_CASE(NVPTXISD::FMAXIMUM3)
+ MAKE_CASE(NVPTXISD::FMINIMUM3)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::STACKRESTORE)
MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -2194,6 +2213,108 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
+/// Build a tree reduction over the elements of \p VectorOp. \p Ops lists the
+/// candidate scalar opcodes with their input counts and is tried in order, so
+/// wider operators go first (eg. max3 before max2). Unlike iterative splitting
+/// in DAGTypeLegalizer, the vector is scalarized first and its elements are
+/// then grouped bottom-up. Returns the value at the root of the tree.
+static SDValue BuildTreeReduction(
+ const SDValue &VectorOp,
+ ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+ const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+ EVT VectorTy = VectorOp.getValueType();
+ EVT EltTy = VectorTy.getVectorElementType();
+ const unsigned NumElts = VectorTy.getVectorNumElements();
+
+ // Scalarize: extract every vector element as its own scalar value.
+ SmallVector<SDValue> Elements(NumElts);
+ for (unsigned I = 0, E = NumElts; I != E; ++I) {
+ Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorOp,
+ DAG.getConstant(I, DL, MVT::i64));
+ }
+
+ // Reduce level by level until one value is left or the operators run out.
+ SmallVector<SDValue> Level = Elements;
+ for (unsigned OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
+ const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
+
+ // Combine as many full groups of this level as the current operator allows.
+ SmallVector<SDValue> ReducedLevel;
+ unsigned I = 0, E = Level.size();
+ for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
+ // Each node consumes DefaultGroupSize consecutive values of this level.
+ ReducedLevel.push_back(DAG.getNode(
+ DefaultScalarOp, DL, EltTy,
+ ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
+ }
+
+ if (I < E) {
+ if (ReducedLevel.empty()) {
+ // Not even one full group fits: the current operator takes more inputs
+ // than this level has values. Move to the next operator and retry.
+ ++OpIdx;
+ assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+ continue;
+ }
+
+ // Leftover values that did not fill a group are carried to the next level.
+ for (; I < E; ++I)
+ ReducedLevel.push_back(Level[I]);
+ }
+ Level = ReducedLevel;
+ }
+
+ return *Level.begin();
+}
+
+/// Lower floating-point vector reductions (fadd, fmul, fmax, fmin, fmaximum,
+/// fminimum) to a tree of scalar operations over the vector's elements.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ // The tree reorders operations: without reassoc (or if disabled), bail out.
+ if (DisableFOpTreeReduce || !Op->getFlags().hasAllowReassociation())
+ return SDValue();
+
+ EVT EltTy = Op.getOperand(0).getValueType().getVectorElementType();
+ const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+ STI.getPTXVersion() >= 88;
+ SDLoc DL(Op);
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Operators;
+ switch (Op->getOpcode()) {
+ case ISD::VECREDUCE_FADD:
+ Operators = {{ISD::FADD, 2}};
+ break;
+ case ISD::VECREDUCE_FMUL:
+ Operators = {{ISD::FMUL, 2}};
+ break;
+ case ISD::VECREDUCE_FMAX:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMAXNUM3, 3});
+ Operators.push_back({ISD::FMAXNUM, 2});
+ break;
+ case ISD::VECREDUCE_FMIN:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMINNUM3, 3});
+ Operators.push_back({ISD::FMINNUM, 2});
+ break;
+ case ISD::VECREDUCE_FMAXIMUM:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMAXIMUM3, 3});
+ Operators.push_back({ISD::FMAXIMUM, 2});
+ break;
+ case ISD::VECREDUCE_FMINIMUM:
+ if (CanUseMinMax3)
+ Operators.push_back({NVPTXISD::FMINIMUM3, 3});
+ Operators.push_back({ISD::FMINIMUM, 2});
+ break;
+ default:
+ llvm_unreachable("unhandled vecreduce operation");
+ }
+
+ return BuildTreeReduction(Op.getOperand(0), Operators, DL, Op->getFlags(),
+ DAG);
+}
+
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -3026,6 +3147,13 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::LOAD:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..b33111c42a166 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -73,6 +73,11 @@ enum NodeType : unsigned {
UNPACK_VECTOR,
FCOPYSIGN,
+ FMAXNUM3,
+ FMINNUM3,
+ FMAXIMUM3,
+ FMINIMUM3,
+
DYNAMIC_STACKALLOC,
STACKRESTORE,
STACKSAVE,
@@ -299,6 +304,7 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5234fb0806189..c99e78526b8e1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -403,6 +403,46 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
+// 3-input min/max for f32 only; every variant requires sm_100+ and PTX 8.8+.
+multiclass FMINIMUMMAXIMUM3<string OpcStr, SDNode OpNode> {
+ def f32rrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rii_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+ def f32rrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+ def f32rii :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
@@ -1181,6 +1221,20 @@ defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min.NaN", nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max.NaN", nvptx_fmaximum3>;
+
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 98aea4e535f0a..49c9236e02d52 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -87,6 +87,8 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
}
unsigned getMinVectorRegisterBitWidth() const override { return 32; }
+ bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
+
// We don't want to prevent inlining because of target-cpu and -features
// attributes that were added to newer versions of LLVM/Clang: There are
// no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index d5b451dad7bc3..90c6b0ebb6725 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -1,14 +1,14 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
+; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_80 %}
; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \
+; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 %}
target triple = "nvptx64-nvidia-cuda"
@@ -23,19 +23,19 @@ define half @reduce_fadd_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT: mov.b16 %rs3, 0x0000;
-; CHECK-NEXT: add.rn.f16 %rs4, %rs1, %rs3;
-; CHECK-NEXT: add.rn.f16 %rs5, %rs4, %rs2;
-; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r2;
-; CHECK-NEXT: add.rn.f16 %rs8, %rs5, %rs6;
-; CHECK-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
-; CHECK-NEXT: mov.b32 {%rs10, %rs11}, %r3;
-; CHECK-NEXT: add.rn.f16 %rs12, %rs9, %rs10;
-; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs11;
-; CHECK-NEXT: mov.b32 {%rs14, %rs15}, %r4;
-; CHECK-NEXT: add.rn.f16 %rs16, %rs13, %rs14;
-; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs15;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT: mov.b16 %rs9, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs10, %rs7, %rs9;
+; CHECK-NEXT: add.rn.f16 %rs11, %rs10, %rs8;
+; CHECK-NEXT: add.rn.f16 %rs12, %rs11, %rs5;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs6;
+; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs3;
+; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs4;
+; CHECK-NEXT: add.rn.f16 %rs16, %rs15, %rs1;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs2;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
@@ -43,50 +43,53 @@ define half @reduce_fadd_half(<8 x half> %in) {
}
define half @reduce_fadd_half_reassoc(<8 x half> %in) {
-; CHECK-SM80-LABEL: reduce_fadd_half_reassoc(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<6>;
-; CHECK-SM80-NEXT: .reg .b32 %r<10>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
-; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000;
-; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4;
-; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fadd_half_reassoc(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<6>;
-; CHECK-SM100-NEXT: .reg .b32 %r<10>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
-; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000;
-; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4;
-; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_fadd_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<18>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: add.rn.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: add.rn.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: mov.b16 %rs16, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs15, %rs16;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
+; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
ret half %res
}
define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fadd_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<16>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fadd_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fadd_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fadd_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: mov.b16 %rs8, 0x0000;
+; CHECK-O0-NEXT: add.rn.f16 %rs9, %rs1, %rs8;
+; CHECK-O0-NEXT: add.rn.f16 %rs10, %rs9, %rs2;
+; CHECK-O0-NEXT: add.rn.f16 %rs11, %rs10, %rs3;
+; CHECK-O0-NEXT: add.rn.f16 %rs12, %rs11, %rs4;
+; CHECK-O0-NEXT: add.rn.f16 %rs13, %rs12, %rs5;
+; CHECK-O0-NEXT: add.rn.f16 %rs14, %rs13, %rs6;
+; CHECK-O0-NEXT: add.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
@@ -113,6 +116,23 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fadd_float(<8 x float> %in) {
+; CHECK-O0-LABEL: reduce_fadd_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<17>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0];
+; CHECK-O0-NEXT: add.rn.f32 %r9, %r1, 0f00000000;
+; CHECK-O0-NEXT: add.rn.f32 %r10, %r9, %r2;
+; CHECK-O0-NEXT: add.rn.f32 %r11, %r10, %r3;
+; CHECK-O0-NEXT: add.rn.f32 %r12, %r11, %r4;
+; CHECK-O0-NEXT: add.rn.f32 %r13, %r12, %r5;
+; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, %r6;
+; CHECK-O0-NEXT: add.rn.f32 %r15, %r14, %r7;
+; CHECK-O0-NEXT: add.rn.f32 %r16, %r15, %r8;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r16;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<17>;
@@ -135,6 +155,23 @@ define float @reduce_fadd_float(<8 x float> %in) {
}
define float @reduce_fadd_float_reassoc(<8 x float> %in) {
+; CHECK-O0-LABEL: reduce_fadd_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<17>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-O0-NEXT: add.rn.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: add.rn.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: add.rn.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: add.rn.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: add.rn.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: add.rn.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: add.rn.f32 %r16, %r15, 0f00000000;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r16;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<17>;
@@ -142,13 +179,13 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
-; CHECK-NEXT: add.rn.f32 %r9, %r3, %r7;
-; CHECK-NEXT: add.rn.f32 %r10, %r1, %r5;
-; CHECK-NEXT: add.rn.f32 %r11, %r4, %r8;
-; CHECK-NEXT: add.rn.f32 %r12, %r2, %r6;
-; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11;
-; CHECK-NEXT: add.rn.f32 %r14, %r10, %r9;
-; CHECK-NEXT: add.rn.f32 %r15, %r14, %r13;
+; CHECK-NEXT: add.rn.f32 %r9, %r7, %r8;
+; CHECK-NEXT: add.rn.f32 %r10, %r5, %r6;
+; CHECK-NEXT: add.rn.f32 %r11, %r10, %r9;
+; CHECK-NEXT: add.rn.f32 %r12, %r3, %r4;
+; CHECK-NEXT: add.rn.f32 %r13, %r1, %r2;
+; CHECK-NEXT: add.rn.f32 %r14, %r13, %r12;
+; CHECK-NEXT: add.rn.f32 %r15, %r14, %r11;
; CHECK-NEXT: add.rn.f32 %r16, %r15, 0f00000000;
; CHECK-NEXT: st.param.b32 [func_retval0], %r16;
; CHECK-NEXT: ret;
@@ -157,6 +194,23 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
}
define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-O0-LABEL: reduce_fadd_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<15>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: add.rn.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: add.rn.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: add.rn.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: add.rn.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: add.rn.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: add.rn.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, 0f00000000;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r14;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<15>;
@@ -165,12 +219,12 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: add.rn.f32 %r8, %r3, %r7;
-; CHECK-NEXT: add.rn.f32 %r9, %r1, %r5;
-; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8;
-; CHECK-NEXT: add.rn.f32 %r11, %r2, %r6;
-; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4;
-; CHECK-NEXT: add.rn.f32 %r13, %r10, %r12;
+; CHECK-NEXT: add.rn.f32 %r8, %r5, %r6;
+; CHECK-NEXT: add.rn.f32 %r9, %r8, %r7;
+; CHECK-NEXT: add.rn.f32 %r10, %r3, %r4;
+; CHECK-NEXT: add.rn.f32 %r11, %r1, %r2;
+; CHECK-NEXT: add.rn.f32 %r12, %r11, %r10;
+; CHECK-NEXT: add.rn.f32 %r13, %r12, %r9;
; CHECK-NEXT: add.rn.f32 %r14, %r13, 0f00000000;
; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: ret;
@@ -187,17 +241,17 @@ define half @reduce_fmul_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r2;
-; CHECK-NEXT: mul.rn.f16 %rs6, %rs3, %rs4;
-; CHECK-NEXT: mul.rn.f16 %rs7, %rs6, %rs5;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
-; CHECK-NEXT: mul.rn.f16 %rs10, %rs7, %rs8;
-; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs9;
-; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
-; CHECK-NEXT: mul.rn.f16 %rs14, %rs11, %rs12;
-; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs9, %rs7, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs5;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs6;
+; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs3;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs12, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs1;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs2;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
@@ -205,66 +259,66 @@ define half @reduce_fmul_half(<8 x half> %in) {
}
define half @reduce_fmul_half_reassoc(<8 x half> %in) {
-; CHECK-SM80-LABEL: reduce_fmul_half_reassoc(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<10>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
-; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_fmul_half_reassoc(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<10>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
-; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_fmul_half_reassoc(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: mul.rn.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: mul.rn.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
+; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
ret half %res
}
define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fmul_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmul_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmul_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: mul.rn.f16 %rs8, %rs3, %rs4;
+; CHECK-O0-NEXT: mul.rn.f16 %rs9, %rs1, %rs2;
+; CHECK-O0-NEXT: mul.rn.f16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: mul.rn.f16 %rs11, %rs5, %rs6;
+; CHECK-O0-NEXT: mul.rn.f16 %rs12, %rs11, %rs7;
+; CHECK-O0-NEXT: mul.rn.f16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_half_reassoc_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmul_half_reassoc_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmul_half_reassoc_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0x3C00;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: mul.rn.f16x2 %r5, %r3, %r4;
-; CHECK-NEXT: mul.rn.f16x2 %r6, %r2, %r1;
-; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: mul.rn.f16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: mul.rn.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: mul.rn.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <7 x half> %in)
ret half %res
@@ -272,6 +326,22 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmul_float(<8 x float> %in) {
+; CHECK-O0-LABEL: reduce_fmul_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0];
+; CHECK-O0-NEXT: mul.rn.f32 %r9, %r1, %r2;
+; CHECK-O0-NEXT: mul.rn.f32 %r10, %r9, %r3;
+; CHECK-O0-NEXT: mul.rn.f32 %r11, %r10, %r4;
+; CHECK-O0-NEXT: mul.rn.f32 %r12, %r11, %r5;
+; CHECK-O0-NEXT: mul.rn.f32 %r13, %r12, %r6;
+; CHECK-O0-NEXT: mul.rn.f32 %r14, %r13, %r7;
+; CHECK-O0-NEXT: mul.rn.f32 %r15, %r14, %r8;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -293,6 +363,22 @@ define float @reduce_fmul_float(<8 x float> %in) {
}
define float @reduce_fmul_float_reassoc(<8 x float> %in) {
+; CHECK-O0-LABEL: reduce_fmul_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-O0-NEXT: mul.rn.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: mul.rn.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: mul.rn.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: mul.rn.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: mul.rn.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: mul.rn.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: mul.rn.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -300,13 +386,13 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
-; CHECK-NEXT: mul.rn.f32 %r9, %r3, %r7;
-; CHECK-NEXT: mul.rn.f32 %r10, %r1, %r5;
-; CHECK-NEXT: mul.rn.f32 %r11, %r4, %r8;
-; CHECK-NEXT: mul.rn.f32 %r12, %r2, %r6;
-; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r11;
-; CHECK-NEXT: mul.rn.f32 %r14, %r10, %r9;
-; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13;
+; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8;
+; CHECK-NEXT: mul.rn.f32 %r10, %r5, %r6;
+; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r9;
+; CHECK-NEXT: mul.rn.f32 %r12, %r3, %r4;
+; CHECK-NEXT: mul.rn.f32 %r13, %r1, %r2;
+; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r12;
+; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
@@ -314,6 +400,22 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
}
define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-O0-LABEL: reduce_fmul_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: mul.rn.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: mul.rn.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: mul.rn.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: mul.rn.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: mul.rn.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: mul.rn.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -322,12 +424,12 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7;
-; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5;
-; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8;
-; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6;
-; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4;
-; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12;
+; CHECK-NEXT: mul.rn.f32 %r8, %r5, %r6;
+; CHECK-NEXT: mul.rn.f32 %r9, %r8, %r7;
+; CHECK-NEXT: mul.rn.f32 %r10, %r3, %r4;
+; CHECK-NEXT: mul.rn.f32 %r11, %r1, %r2;
+; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r10;
+; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
@@ -357,43 +459,64 @@ define half @reduce_fmax_half(<8 x half> %in) {
define half @reduce_fmax_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmax_half_reassoc(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0];
-; CHECK-NEXT: max.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: max.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: max.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmax(<8 x half> %in)
ret half %res
}
define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fmax_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: max.f16 %rs8, %rs3, %rs4;
+; CHECK-O0-NEXT: max.f16 %rs9, %rs1, %rs2;
+; CHECK-O0-NEXT: max.f16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: max.f16 %rs11, %rs5, %rs6;
+; CHECK-O0-NEXT: max.f16 %rs12, %rs11, %rs7;
+; CHECK-O0-NEXT: max.f16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmax_half_reassoc_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0xFE00;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: max.f16x2 %r5, %r3, %r4;
-; CHECK-NEXT: max.f16x2 %r6, %r2, %r1;
-; CHECK-NEXT: max.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: max.f16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: max.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: max.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmax(<7 x half> %in)
ret half %res
@@ -402,6 +525,22 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmax_float(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmax_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
+; CHECK-O0-NEXT: max.f32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: max.f32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: max.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.f32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: max.f32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: max.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmax_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -424,6 +563,22 @@ define float @reduce_fmax_float(<8 x float> %in) {
define float @reduce_fmax_float_reassoc(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmax_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-O0-NEXT: max.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: max.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: max.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: max.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: max.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmax_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -431,11 +586,11 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-NEXT: max.f32 %r9, %r4, %r8;
-; CHECK-NEXT: max.f32 %r10, %r2, %r6;
+; CHECK-NEXT: max.f32 %r9, %r7, %r8;
+; CHECK-NEXT: max.f32 %r10, %r5, %r6;
; CHECK-NEXT: max.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.f32 %r12, %r3, %r7;
-; CHECK-NEXT: max.f32 %r13, %r1, %r5;
+; CHECK-NEXT: max.f32 %r12, %r3, %r4;
+; CHECK-NEXT: max.f32 %r13, %r1, %r2;
; CHECK-NEXT: max.f32 %r14, %r13, %r12;
; CHECK-NEXT: max.f32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -446,6 +601,22 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmax_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: max.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: max.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: max.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: max.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: max.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: max.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -454,12 +625,12 @@ define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: max.f32 %r8, %r3, %r7;
-; CHECK-NEXT: max.f32 %r9, %r1, %r5;
-; CHECK-NEXT: max.f32 %r10, %r9, %r8;
-; CHECK-NEXT: max.f32 %r11, %r2, %r6;
-; CHECK-NEXT: max.f32 %r12, %r11, %r4;
-; CHECK-NEXT: max.f32 %r13, %r10, %r12;
+; CHECK-NEXT: max.f32 %r8, %r5, %r6;
+; CHECK-NEXT: max.f32 %r9, %r8, %r7;
+; CHECK-NEXT: max.f32 %r10, %r3, %r4;
+; CHECK-NEXT: max.f32 %r11, %r1, %r2;
+; CHECK-NEXT: max.f32 %r12, %r11, %r10;
+; CHECK-NEXT: max.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
@@ -489,43 +660,64 @@ define half @reduce_fmin_half(<8 x half> %in) {
define half @reduce_fmin_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmin_half_reassoc(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0];
-; CHECK-NEXT: min.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: min.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: min.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmin(<8 x half> %in)
ret half %res
}
define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fmin_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmin_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: min.f16 %rs8, %rs3, %rs4;
+; CHECK-O0-NEXT: min.f16 %rs9, %rs1, %rs2;
+; CHECK-O0-NEXT: min.f16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: min.f16 %rs11, %rs5, %rs6;
+; CHECK-O0-NEXT: min.f16 %rs12, %rs11, %rs7;
+; CHECK-O0-NEXT: min.f16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmin_half_reassoc_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmin_half_reassoc_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_reassoc_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0x7E00;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: min.f16x2 %r5, %r3, %r4;
-; CHECK-NEXT: min.f16x2 %r6, %r2, %r1;
-; CHECK-NEXT: min.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: min.f16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: min.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: min.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmin(<7 x half> %in)
ret half %res
@@ -534,6 +726,22 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmin_float(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmin_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
+; CHECK-O0-NEXT: min.f32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: min.f32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: min.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.f32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: min.f32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: min.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmin_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -556,6 +764,22 @@ define float @reduce_fmin_float(<8 x float> %in) {
define float @reduce_fmin_float_reassoc(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmin_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-O0-NEXT: min.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: min.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: min.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: min.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: min.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmin_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -563,11 +787,11 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-NEXT: min.f32 %r9, %r4, %r8;
-; CHECK-NEXT: min.f32 %r10, %r2, %r6;
+; CHECK-NEXT: min.f32 %r9, %r7, %r8;
+; CHECK-NEXT: min.f32 %r10, %r5, %r6;
; CHECK-NEXT: min.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.f32 %r12, %r3, %r7;
-; CHECK-NEXT: min.f32 %r13, %r1, %r5;
+; CHECK-NEXT: min.f32 %r12, %r3, %r4;
+; CHECK-NEXT: min.f32 %r13, %r1, %r2;
; CHECK-NEXT: min.f32 %r14, %r13, %r12;
; CHECK-NEXT: min.f32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -578,6 +802,22 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmin_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: min.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: min.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: min.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: min.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: min.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: min.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmin_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -586,12 +826,12 @@ define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: min.f32 %r8, %r3, %r7;
-; CHECK-NEXT: min.f32 %r9, %r1, %r5;
-; CHECK-NEXT: min.f32 %r10, %r9, %r8;
-; CHECK-NEXT: min.f32 %r11, %r2, %r6;
-; CHECK-NEXT: min.f32 %r12, %r11, %r4;
-; CHECK-NEXT: min.f32 %r13, %r10, %r12;
+; CHECK-NEXT: min.f32 %r8, %r5, %r6;
+; CHECK-NEXT: min.f32 %r9, %r8, %r7;
+; CHECK-NEXT: min.f32 %r10, %r3, %r4;
+; CHECK-NEXT: min.f32 %r11, %r1, %r2;
+; CHECK-NEXT: min.f32 %r12, %r11, %r10;
+; CHECK-NEXT: min.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in)
@@ -621,43 +861,64 @@ define half @reduce_fmaximum_half(<8 x half> %in) {
define half @reduce_fmaximum_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmaximum_half_reassoc(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0];
-; CHECK-NEXT: max.NaN.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: max.NaN.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: max.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmaximum(<8 x half> %in)
ret half %res
}
define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fmaximum_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmaximum_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmaximum_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmaximum_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: max.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-O0-NEXT: max.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-O0-NEXT: max.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: max.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-O0-NEXT: max.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-O0-NEXT: max.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmaximum_half_reassoc_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmaximum_half_reassoc_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmaximum_half_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmaximum_half_reassoc_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0xFC00;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: max.NaN.f16x2 %r5, %r3, %r4;
-; CHECK-NEXT: max.NaN.f16x2 %r6, %r2, %r1;
-; CHECK-NEXT: max.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: max.NaN.f16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmaximum_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: max.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: max.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fmaximum(<7 x half> %in)
ret half %res
@@ -666,6 +927,22 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmaximum_float(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmaximum_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
+; CHECK-O0-NEXT: max.NaN.f32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: max.NaN.f32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: max.NaN.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.NaN.f32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: max.NaN.f32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: max.NaN.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.NaN.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmaximum_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -688,6 +965,22 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmaximum_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-O0-NEXT: max.NaN.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: max.NaN.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: max.NaN.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.NaN.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: max.NaN.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: max.NaN.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.NaN.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmaximum_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -695,11 +988,11 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8;
-; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT: max.NaN.f32 %r9, %r7, %r8;
+; CHECK-NEXT: max.NaN.f32 %r10, %r5, %r6;
; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7;
-; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r4;
+; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r2;
; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12;
; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -710,6 +1003,22 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: max.NaN.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: max.NaN.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: max.NaN.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: max.NaN.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: max.NaN.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: max.NaN.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -718,12 +1027,12 @@ define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: max.NaN.f32 %r8, %r3, %r7;
-; CHECK-NEXT: max.NaN.f32 %r9, %r1, %r5;
-; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r8;
-; CHECK-NEXT: max.NaN.f32 %r11, %r2, %r6;
-; CHECK-NEXT: max.NaN.f32 %r12, %r11, %r4;
-; CHECK-NEXT: max.NaN.f32 %r13, %r10, %r12;
+; CHECK-NEXT: max.NaN.f32 %r8, %r5, %r6;
+; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r7;
+; CHECK-NEXT: max.NaN.f32 %r10, %r3, %r4;
+; CHECK-NEXT: max.NaN.f32 %r11, %r1, %r2;
+; CHECK-NEXT: max.NaN.f32 %r12, %r11, %r10;
+; CHECK-NEXT: max.NaN.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in)
@@ -753,43 +1062,64 @@ define half @reduce_fminimum_half(<8 x half> %in) {
define half @reduce_fminimum_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fminimum_half_reassoc(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0];
-; CHECK-NEXT: min.NaN.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: min.NaN.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: min.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fminimum(<8 x half> %in)
ret half %res
}
define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) {
+; CHECK-O0-LABEL: reduce_fminimum_half_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fminimum_half_reassoc_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fminimum_half_reassoc_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fminimum_half_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: min.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-O0-NEXT: min.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-O0-NEXT: min.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: min.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-O0-NEXT: min.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-O0-NEXT: min.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fminimum_half_reassoc_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_fminimum_half_reassoc_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fminimum_half_reassoc_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fminimum_half_reassoc_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0x7C00;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: min.NaN.f16x2 %r5, %r3, %r4;
-; CHECK-NEXT: min.NaN.f16x2 %r6, %r2, %r1;
-; CHECK-NEXT: min.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: min.NaN.f16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs11;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fminimum_half_reassoc_nonpow2_param_0];
+; CHECK-NEXT: min.NaN.f16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.NaN.f16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.NaN.f16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.NaN.f16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: min.NaN.f16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs13;
; CHECK-NEXT: ret;
%res = call reassoc half @llvm.vector.reduce.fminimum(<7 x half> %in)
ret half %res
@@ -798,6 +1128,22 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fminimum_float(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fminimum_float(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
+; CHECK-O0-NEXT: min.NaN.f32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: min.NaN.f32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: min.NaN.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.NaN.f32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: min.NaN.f32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: min.NaN.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.NaN.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fminimum_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -820,6 +1166,22 @@ define float @reduce_fminimum_float(<8 x float> %in) {
define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fminimum_float_reassoc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-O0-NEXT: min.NaN.f32 %r9, %r7, %r8;
+; CHECK-O0-NEXT: min.NaN.f32 %r10, %r5, %r6;
+; CHECK-O0-NEXT: min.NaN.f32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.NaN.f32 %r12, %r3, %r4;
+; CHECK-O0-NEXT: min.NaN.f32 %r13, %r1, %r2;
+; CHECK-O0-NEXT: min.NaN.f32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.NaN.f32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fminimum_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -827,11 +1189,11 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8;
-; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6;
+; CHECK-NEXT: min.NaN.f32 %r9, %r7, %r8;
+; CHECK-NEXT: min.NaN.f32 %r10, %r5, %r6;
; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7;
-; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5;
+; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r4;
+; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r2;
; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12;
; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -842,6 +1204,22 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
;
+; CHECK-O0-LABEL: reduce_fminimum_float_reassoc_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
+; CHECK-O0-NEXT: min.NaN.f32 %r8, %r5, %r6;
+; CHECK-O0-NEXT: min.NaN.f32 %r9, %r8, %r7;
+; CHECK-O0-NEXT: min.NaN.f32 %r10, %r3, %r4;
+; CHECK-O0-NEXT: min.NaN.f32 %r11, %r1, %r2;
+; CHECK-O0-NEXT: min.NaN.f32 %r12, %r11, %r10;
+; CHECK-O0-NEXT: min.NaN.f32 %r13, %r12, %r9;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -850,12 +1228,12 @@ define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: min.NaN.f32 %r8, %r3, %r7;
-; CHECK-NEXT: min.NaN.f32 %r9, %r1, %r5;
-; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r8;
-; CHECK-NEXT: min.NaN.f32 %r11, %r2, %r6;
-; CHECK-NEXT: min.NaN.f32 %r12, %r11, %r4;
-; CHECK-NEXT: min.NaN.f32 %r13, %r10, %r12;
+; CHECK-NEXT: min.NaN.f32 %r8, %r5, %r6;
+; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r7;
+; CHECK-NEXT: min.NaN.f32 %r10, %r3, %r4;
+; CHECK-NEXT: min.NaN.f32 %r11, %r1, %r2;
+; CHECK-NEXT: min.NaN.f32 %r12, %r11, %r10;
+; CHECK-NEXT: min.NaN.f32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
@@ -863,6 +1241,7 @@ define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
}
define i16 @reduce_add_i16(<8 x i16> %in) {
+;
; CHECK-SM80-LABEL: reduce_add_i16(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
@@ -872,15 +1251,15 @@ define i16 @reduce_add_i16(<8 x i16> %in) {
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: add.s16 %rs5, %rs3, %rs1;
+; CHECK-SM80-NEXT: add.s16 %rs5, %rs4, %rs2;
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: add.s16 %rs10, %rs8, %rs6;
-; CHECK-SM80-NEXT: add.s16 %rs11, %rs4, %rs2;
-; CHECK-SM80-NEXT: add.s16 %rs12, %rs9, %rs7;
-; CHECK-SM80-NEXT: add.s16 %rs13, %rs12, %rs11;
-; CHECK-SM80-NEXT: add.s16 %rs14, %rs10, %rs5;
-; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs13;
+; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs7;
+; CHECK-SM80-NEXT: add.s16 %rs11, %rs10, %rs5;
+; CHECK-SM80-NEXT: add.s16 %rs12, %rs3, %rs1;
+; CHECK-SM80-NEXT: add.s16 %rs13, %rs8, %rs6;
+; CHECK-SM80-NEXT: add.s16 %rs14, %rs13, %rs12;
+; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs11;
; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM80-NEXT: ret;
@@ -888,26 +1267,67 @@ define i16 @reduce_add_i16(<8 x i16> %in) {
; CHECK-SM100-LABEL: reduce_add_i16(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4;
; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3;
; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-SM100-NEXT: ret;
%res = call i16 @llvm.vector.reduce.add(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
+;
+;
+; CHECK-SM80-O0-LABEL: reduce_add_i16_nonpow2(
+; CHECK-SM80-O0: {
+; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-O0-EMPTY:
+; CHECK-SM80-O0-NEXT: // %bb.0:
+; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-SM80-O0-NEXT: add.s16 %rs8, %rs3, %rs7;
+; CHECK-SM80-O0-NEXT: add.s16 %rs9, %rs1, %rs5;
+; CHECK-SM80-O0-NEXT: add.s16 %rs10, %rs9, %rs8;
+; CHECK-SM80-O0-NEXT: add.s16 %rs11, %rs2, %rs6;
+; CHECK-SM80-O0-NEXT: add.s16 %rs12, %rs11, %rs4;
+; CHECK-SM80-O0-NEXT: add.s16 %rs13, %rs10, %rs12;
+; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-O0-NEXT: ret;
+;
+; CHECK-SM100-O0-LABEL: reduce_add_i16_nonpow2(
+; CHECK-SM100-O0: {
+; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-O0-EMPTY:
+; CHECK-SM100-O0-NEXT: // %bb.0:
+; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 0;
+; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-SM100-O0-NEXT: add.s16x2 %r5, %r3, %r4;
+; CHECK-SM100-O0-NEXT: add.s16x2 %r6, %r2, %r1;
+; CHECK-SM100-O0-NEXT: add.s16x2 %r7, %r6, %r5;
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-SM100-O0-NEXT: add.s16 %rs11, %rs9, %rs10;
+; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-O0-NEXT: ret;
; CHECK-SM80-LABEL: reduce_add_i16_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
@@ -955,6 +1375,22 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_add_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_add_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
+; CHECK-O0-NEXT: add.s32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: add.s32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: add.s32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: add.s32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: add.s32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: add.s32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_add_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -962,13 +1398,13 @@ define i32 @reduce_add_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
-; CHECK-NEXT: add.s32 %r9, %r3, %r7;
-; CHECK-NEXT: add.s32 %r10, %r1, %r5;
-; CHECK-NEXT: add.s32 %r11, %r4, %r8;
-; CHECK-NEXT: add.s32 %r12, %r2, %r6;
-; CHECK-NEXT: add.s32 %r13, %r12, %r11;
-; CHECK-NEXT: add.s32 %r14, %r10, %r9;
-; CHECK-NEXT: add.s32 %r15, %r14, %r13;
+; CHECK-NEXT: add.s32 %r9, %r4, %r8;
+; CHECK-NEXT: add.s32 %r10, %r2, %r6;
+; CHECK-NEXT: add.s32 %r11, %r10, %r9;
+; CHECK-NEXT: add.s32 %r12, %r3, %r7;
+; CHECK-NEXT: add.s32 %r13, %r1, %r5;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.add(<8 x i32> %in)
@@ -976,6 +1412,22 @@ define i32 @reduce_add_i32(<8 x i32> %in) {
}
define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_add_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: add.s32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: add.s32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: add.s32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: add.s32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: add.s32 %r12, %r11, %r4;
+; CHECK-O0-NEXT: add.s32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_add_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1006,15 +1458,15 @@ define i16 @reduce_mul_i16(<8 x i16> %in) {
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NEXT: mul.lo.s16 %rs5, %rs3, %rs1;
+; CHECK-NEXT: mul.lo.s16 %rs5, %rs4, %rs2;
; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-NEXT: mul.lo.s16 %rs10, %rs8, %rs6;
-; CHECK-NEXT: mul.lo.s16 %rs11, %rs4, %rs2;
-; CHECK-NEXT: mul.lo.s16 %rs12, %rs9, %rs7;
-; CHECK-NEXT: mul.lo.s16 %rs13, %rs12, %rs11;
-; CHECK-NEXT: mul.lo.s16 %rs14, %rs10, %rs5;
-; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs7;
+; CHECK-NEXT: mul.lo.s16 %rs11, %rs10, %rs5;
+; CHECK-NEXT: mul.lo.s16 %rs12, %rs3, %rs1;
+; CHECK-NEXT: mul.lo.s16 %rs13, %rs8, %rs6;
+; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs12;
+; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs11;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
@@ -1023,6 +1475,25 @@ define i16 @reduce_mul_i16(<8 x i16> %in) {
}
define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
+; CHECK-O0-LABEL: reduce_mul_i16_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_mul_i16_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
+; CHECK-O0-NEXT: mul.lo.s16 %rs8, %rs3, %rs7;
+; CHECK-O0-NEXT: mul.lo.s16 %rs9, %rs1, %rs5;
+; CHECK-O0-NEXT: mul.lo.s16 %rs10, %rs9, %rs8;
+; CHECK-O0-NEXT: mul.lo.s16 %rs11, %rs2, %rs6;
+; CHECK-O0-NEXT: mul.lo.s16 %rs12, %rs4, %rs11;
+; CHECK-O0-NEXT: mul.lo.s16 %rs13, %rs10, %rs12;
+; CHECK-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i16_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -1047,6 +1518,22 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_mul_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_mul_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
+; CHECK-O0-NEXT: mul.lo.s32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: mul.lo.s32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: mul.lo.s32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: mul.lo.s32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: mul.lo.s32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: mul.lo.s32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: mul.lo.s32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1054,13 +1541,13 @@ define i32 @reduce_mul_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
-; CHECK-NEXT: mul.lo.s32 %r9, %r3, %r7;
-; CHECK-NEXT: mul.lo.s32 %r10, %r1, %r5;
-; CHECK-NEXT: mul.lo.s32 %r11, %r4, %r8;
-; CHECK-NEXT: mul.lo.s32 %r12, %r2, %r6;
-; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r11;
-; CHECK-NEXT: mul.lo.s32 %r14, %r10, %r9;
-; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r13;
+; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8;
+; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6;
+; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9;
+; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7;
+; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5;
+; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12;
+; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in)
@@ -1068,6 +1555,22 @@ define i32 @reduce_mul_i32(<8 x i32> %in) {
}
define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_mul_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: mul.lo.s32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: mul.lo.s32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: mul.lo.s32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: mul.lo.s32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: mul.lo.s32 %r12, %r4, %r11;
+; CHECK-O0-NEXT: mul.lo.s32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1089,6 +1592,7 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_umax_i16(<8 x i16> %in) {
+;
; CHECK-SM80-LABEL: reduce_umax_i16(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
@@ -1098,15 +1602,15 @@ define i16 @reduce_umax_i16(<8 x i16> %in) {
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: max.u16 %rs5, %rs3, %rs1;
+; CHECK-SM80-NEXT: max.u16 %rs5, %rs4, %rs2;
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: max.u16 %rs10, %rs8, %rs6;
-; CHECK-SM80-NEXT: max.u16 %rs11, %rs4, %rs2;
-; CHECK-SM80-NEXT: max.u16 %rs12, %rs9, %rs7;
-; CHECK-SM80-NEXT: max.u16 %rs13, %rs12, %rs11;
-; CHECK-SM80-NEXT: max.u16 %rs14, %rs10, %rs5;
-; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs13;
+; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs7;
+; CHECK-SM80-NEXT: max.u16 %rs11, %rs10, %rs5;
+; CHECK-SM80-NEXT: max.u16 %rs12, %rs3, %rs1;
+; CHECK-SM80-NEXT: max.u16 %rs13, %rs8, %rs6;
+; CHECK-SM80-NEXT: max.u16 %rs14, %rs13, %rs12;
+; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs11;
; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM80-NEXT: ret;
@@ -1114,26 +1618,67 @@ define i16 @reduce_umax_i16(<8 x i16> %in) {
; CHECK-SM100-LABEL: reduce_umax_i16(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4;
; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3;
; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-SM100-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
+;
+;
+; CHECK-SM80-O0-LABEL: reduce_umax_i16_nonpow2(
+; CHECK-SM80-O0: {
+; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-O0-EMPTY:
+; CHECK-SM80-O0-NEXT: // %bb.0:
+; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-SM80-O0-NEXT: max.u16 %rs8, %rs3, %rs7;
+; CHECK-SM80-O0-NEXT: max.u16 %rs9, %rs1, %rs5;
+; CHECK-SM80-O0-NEXT: max.u16 %rs10, %rs9, %rs8;
+; CHECK-SM80-O0-NEXT: max.u16 %rs11, %rs2, %rs6;
+; CHECK-SM80-O0-NEXT: max.u16 %rs12, %rs4, %rs11;
+; CHECK-SM80-O0-NEXT: max.u16 %rs13, %rs10, %rs12;
+; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-O0-NEXT: ret;
+;
+; CHECK-SM100-O0-LABEL: reduce_umax_i16_nonpow2(
+; CHECK-SM100-O0: {
+; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-O0-EMPTY:
+; CHECK-SM100-O0-NEXT: // %bb.0:
+; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 0;
+; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-SM100-O0-NEXT: max.u16x2 %r5, %r3, %r4;
+; CHECK-SM100-O0-NEXT: max.u16x2 %r6, %r2, %r1;
+; CHECK-SM100-O0-NEXT: max.u16x2 %r7, %r6, %r5;
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-SM100-O0-NEXT: max.u16 %rs11, %rs9, %rs10;
+; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-O0-NEXT: ret;
; CHECK-SM80-LABEL: reduce_umax_i16_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
@@ -1181,6 +1726,22 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_umax_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_umax_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
+; CHECK-O0-NEXT: max.u32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: max.u32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: max.u32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.u32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: max.u32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: max.u32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.u32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umax_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1188,13 +1749,13 @@ define i32 @reduce_umax_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
-; CHECK-NEXT: max.u32 %r9, %r3, %r7;
-; CHECK-NEXT: max.u32 %r10, %r1, %r5;
-; CHECK-NEXT: max.u32 %r11, %r4, %r8;
-; CHECK-NEXT: max.u32 %r12, %r2, %r6;
-; CHECK-NEXT: max.u32 %r13, %r12, %r11;
-; CHECK-NEXT: max.u32 %r14, %r10, %r9;
-; CHECK-NEXT: max.u32 %r15, %r14, %r13;
+; CHECK-NEXT: max.u32 %r9, %r4, %r8;
+; CHECK-NEXT: max.u32 %r10, %r2, %r6;
+; CHECK-NEXT: max.u32 %r11, %r10, %r9;
+; CHECK-NEXT: max.u32 %r12, %r3, %r7;
+; CHECK-NEXT: max.u32 %r13, %r1, %r5;
+; CHECK-NEXT: max.u32 %r14, %r13, %r12;
+; CHECK-NEXT: max.u32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in)
@@ -1202,6 +1763,22 @@ define i32 @reduce_umax_i32(<8 x i32> %in) {
}
define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_umax_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: max.u32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: max.u32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: max.u32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: max.u32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: max.u32 %r12, %r4, %r11;
+; CHECK-O0-NEXT: max.u32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umax_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1223,6 +1800,7 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_umin_i16(<8 x i16> %in) {
+;
; CHECK-SM80-LABEL: reduce_umin_i16(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
@@ -1232,15 +1810,15 @@ define i16 @reduce_umin_i16(<8 x i16> %in) {
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: min.u16 %rs5, %rs3, %rs1;
+; CHECK-SM80-NEXT: min.u16 %rs5, %rs4, %rs2;
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: min.u16 %rs10, %rs8, %rs6;
-; CHECK-SM80-NEXT: min.u16 %rs11, %rs4, %rs2;
-; CHECK-SM80-NEXT: min.u16 %rs12, %rs9, %rs7;
-; CHECK-SM80-NEXT: min.u16 %rs13, %rs12, %rs11;
-; CHECK-SM80-NEXT: min.u16 %rs14, %rs10, %rs5;
-; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs13;
+; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs7;
+; CHECK-SM80-NEXT: min.u16 %rs11, %rs10, %rs5;
+; CHECK-SM80-NEXT: min.u16 %rs12, %rs3, %rs1;
+; CHECK-SM80-NEXT: min.u16 %rs13, %rs8, %rs6;
+; CHECK-SM80-NEXT: min.u16 %rs14, %rs13, %rs12;
+; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs11;
; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM80-NEXT: ret;
@@ -1248,26 +1826,67 @@ define i16 @reduce_umin_i16(<8 x i16> %in) {
; CHECK-SM100-LABEL: reduce_umin_i16(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4;
; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3;
; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-SM100-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
+;
+;
+; CHECK-SM80-O0-LABEL: reduce_umin_i16_nonpow2(
+; CHECK-SM80-O0: {
+; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-O0-EMPTY:
+; CHECK-SM80-O0-NEXT: // %bb.0:
+; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-SM80-O0-NEXT: min.u16 %rs8, %rs3, %rs7;
+; CHECK-SM80-O0-NEXT: min.u16 %rs9, %rs1, %rs5;
+; CHECK-SM80-O0-NEXT: min.u16 %rs10, %rs9, %rs8;
+; CHECK-SM80-O0-NEXT: min.u16 %rs11, %rs2, %rs6;
+; CHECK-SM80-O0-NEXT: min.u16 %rs12, %rs4, %rs11;
+; CHECK-SM80-O0-NEXT: min.u16 %rs13, %rs10, %rs12;
+; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-O0-NEXT: ret;
+;
+; CHECK-SM100-O0-LABEL: reduce_umin_i16_nonpow2(
+; CHECK-SM100-O0: {
+; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-O0-EMPTY:
+; CHECK-SM100-O0-NEXT: // %bb.0:
+; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-SM100-O0-NEXT: mov.b16 %rs8, -1;
+; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-SM100-O0-NEXT: min.u16x2 %r5, %r3, %r4;
+; CHECK-SM100-O0-NEXT: min.u16x2 %r6, %r2, %r1;
+; CHECK-SM100-O0-NEXT: min.u16x2 %r7, %r6, %r5;
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-SM100-O0-NEXT: min.u16 %rs11, %rs9, %rs10;
+; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-O0-NEXT: ret;
; CHECK-SM80-LABEL: reduce_umin_i16_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
@@ -1315,6 +1934,22 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_umin_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_umin_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
+; CHECK-O0-NEXT: min.u32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: min.u32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: min.u32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.u32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: min.u32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: min.u32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.u32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umin_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1322,13 +1957,13 @@ define i32 @reduce_umin_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
-; CHECK-NEXT: min.u32 %r9, %r3, %r7;
-; CHECK-NEXT: min.u32 %r10, %r1, %r5;
-; CHECK-NEXT: min.u32 %r11, %r4, %r8;
-; CHECK-NEXT: min.u32 %r12, %r2, %r6;
-; CHECK-NEXT: min.u32 %r13, %r12, %r11;
-; CHECK-NEXT: min.u32 %r14, %r10, %r9;
-; CHECK-NEXT: min.u32 %r15, %r14, %r13;
+; CHECK-NEXT: min.u32 %r9, %r4, %r8;
+; CHECK-NEXT: min.u32 %r10, %r2, %r6;
+; CHECK-NEXT: min.u32 %r11, %r10, %r9;
+; CHECK-NEXT: min.u32 %r12, %r3, %r7;
+; CHECK-NEXT: min.u32 %r13, %r1, %r5;
+; CHECK-NEXT: min.u32 %r14, %r13, %r12;
+; CHECK-NEXT: min.u32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in)
@@ -1336,6 +1971,22 @@ define i32 @reduce_umin_i32(<8 x i32> %in) {
}
define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_umin_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: min.u32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: min.u32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: min.u32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: min.u32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: min.u32 %r12, %r4, %r11;
+; CHECK-O0-NEXT: min.u32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umin_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1357,6 +2008,7 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_smax_i16(<8 x i16> %in) {
+;
; CHECK-SM80-LABEL: reduce_smax_i16(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
@@ -1366,15 +2018,15 @@ define i16 @reduce_smax_i16(<8 x i16> %in) {
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: max.s16 %rs5, %rs3, %rs1;
+; CHECK-SM80-NEXT: max.s16 %rs5, %rs4, %rs2;
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: max.s16 %rs10, %rs8, %rs6;
-; CHECK-SM80-NEXT: max.s16 %rs11, %rs4, %rs2;
-; CHECK-SM80-NEXT: max.s16 %rs12, %rs9, %rs7;
-; CHECK-SM80-NEXT: max.s16 %rs13, %rs12, %rs11;
-; CHECK-SM80-NEXT: max.s16 %rs14, %rs10, %rs5;
-; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs13;
+; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs7;
+; CHECK-SM80-NEXT: max.s16 %rs11, %rs10, %rs5;
+; CHECK-SM80-NEXT: max.s16 %rs12, %rs3, %rs1;
+; CHECK-SM80-NEXT: max.s16 %rs13, %rs8, %rs6;
+; CHECK-SM80-NEXT: max.s16 %rs14, %rs13, %rs12;
+; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs11;
; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM80-NEXT: ret;
@@ -1382,26 +2034,67 @@ define i16 @reduce_smax_i16(<8 x i16> %in) {
; CHECK-SM100-LABEL: reduce_smax_i16(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4;
; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3;
; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-SM100-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
+;
+;
+; CHECK-SM80-O0-LABEL: reduce_smax_i16_nonpow2(
+; CHECK-SM80-O0: {
+; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-O0-EMPTY:
+; CHECK-SM80-O0-NEXT: // %bb.0:
+; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-SM80-O0-NEXT: max.s16 %rs8, %rs3, %rs7;
+; CHECK-SM80-O0-NEXT: max.s16 %rs9, %rs1, %rs5;
+; CHECK-SM80-O0-NEXT: max.s16 %rs10, %rs9, %rs8;
+; CHECK-SM80-O0-NEXT: max.s16 %rs11, %rs2, %rs6;
+; CHECK-SM80-O0-NEXT: max.s16 %rs12, %rs4, %rs11;
+; CHECK-SM80-O0-NEXT: max.s16 %rs13, %rs10, %rs12;
+; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-O0-NEXT: ret;
+;
+; CHECK-SM100-O0-LABEL: reduce_smax_i16_nonpow2(
+; CHECK-SM100-O0: {
+; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-O0-EMPTY:
+; CHECK-SM100-O0-NEXT: // %bb.0:
+; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-SM100-O0-NEXT: mov.b16 %rs8, -32768;
+; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-SM100-O0-NEXT: max.s16x2 %r5, %r3, %r4;
+; CHECK-SM100-O0-NEXT: max.s16x2 %r6, %r2, %r1;
+; CHECK-SM100-O0-NEXT: max.s16x2 %r7, %r6, %r5;
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-SM100-O0-NEXT: max.s16 %rs11, %rs9, %rs10;
+; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-O0-NEXT: ret;
; CHECK-SM80-LABEL: reduce_smax_i16_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
@@ -1449,6 +2142,22 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_smax_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_smax_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
+; CHECK-O0-NEXT: max.s32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: max.s32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: max.s32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: max.s32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: max.s32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: max.s32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: max.s32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smax_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1456,13 +2165,13 @@ define i32 @reduce_smax_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
-; CHECK-NEXT: max.s32 %r9, %r3, %r7;
-; CHECK-NEXT: max.s32 %r10, %r1, %r5;
-; CHECK-NEXT: max.s32 %r11, %r4, %r8;
-; CHECK-NEXT: max.s32 %r12, %r2, %r6;
-; CHECK-NEXT: max.s32 %r13, %r12, %r11;
-; CHECK-NEXT: max.s32 %r14, %r10, %r9;
-; CHECK-NEXT: max.s32 %r15, %r14, %r13;
+; CHECK-NEXT: max.s32 %r9, %r4, %r8;
+; CHECK-NEXT: max.s32 %r10, %r2, %r6;
+; CHECK-NEXT: max.s32 %r11, %r10, %r9;
+; CHECK-NEXT: max.s32 %r12, %r3, %r7;
+; CHECK-NEXT: max.s32 %r13, %r1, %r5;
+; CHECK-NEXT: max.s32 %r14, %r13, %r12;
+; CHECK-NEXT: max.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in)
@@ -1470,6 +2179,22 @@ define i32 @reduce_smax_i32(<8 x i32> %in) {
}
define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_smax_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: max.s32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: max.s32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: max.s32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: max.s32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: max.s32 %r12, %r4, %r11;
+; CHECK-O0-NEXT: max.s32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smax_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1491,6 +2216,7 @@ define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_smin_i16(<8 x i16> %in) {
+;
; CHECK-SM80-LABEL: reduce_smin_i16(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
@@ -1500,15 +2226,15 @@ define i16 @reduce_smin_i16(<8 x i16> %in) {
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: min.s16 %rs5, %rs3, %rs1;
+; CHECK-SM80-NEXT: min.s16 %rs5, %rs4, %rs2;
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: min.s16 %rs10, %rs8, %rs6;
-; CHECK-SM80-NEXT: min.s16 %rs11, %rs4, %rs2;
-; CHECK-SM80-NEXT: min.s16 %rs12, %rs9, %rs7;
-; CHECK-SM80-NEXT: min.s16 %rs13, %rs12, %rs11;
-; CHECK-SM80-NEXT: min.s16 %rs14, %rs10, %rs5;
-; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs13;
+; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs7;
+; CHECK-SM80-NEXT: min.s16 %rs11, %rs10, %rs5;
+; CHECK-SM80-NEXT: min.s16 %rs12, %rs3, %rs1;
+; CHECK-SM80-NEXT: min.s16 %rs13, %rs8, %rs6;
+; CHECK-SM80-NEXT: min.s16 %rs14, %rs13, %rs12;
+; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs11;
; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-SM80-NEXT: ret;
@@ -1516,26 +2242,67 @@ define i16 @reduce_smin_i16(<8 x i16> %in) {
; CHECK-SM100-LABEL: reduce_smin_i16(
; CHECK-SM100: {
; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4;
; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3;
; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-SM100-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
+;
+;
+; CHECK-SM80-O0-LABEL: reduce_smin_i16_nonpow2(
+; CHECK-SM80-O0: {
+; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
+; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-O0-EMPTY:
+; CHECK-SM80-O0-NEXT: // %bb.0:
+; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-SM80-O0-NEXT: min.s16 %rs8, %rs3, %rs7;
+; CHECK-SM80-O0-NEXT: min.s16 %rs9, %rs1, %rs5;
+; CHECK-SM80-O0-NEXT: min.s16 %rs10, %rs9, %rs8;
+; CHECK-SM80-O0-NEXT: min.s16 %rs11, %rs2, %rs6;
+; CHECK-SM80-O0-NEXT: min.s16 %rs12, %rs4, %rs11;
+; CHECK-SM80-O0-NEXT: min.s16 %rs13, %rs10, %rs12;
+; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-O0-NEXT: ret;
+;
+; CHECK-SM100-O0-LABEL: reduce_smin_i16_nonpow2(
+; CHECK-SM100-O0: {
+; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-O0-EMPTY:
+; CHECK-SM100-O0-NEXT: // %bb.0:
+; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 32767;
+; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-SM100-O0-NEXT: min.s16x2 %r5, %r3, %r4;
+; CHECK-SM100-O0-NEXT: min.s16x2 %r6, %r2, %r1;
+; CHECK-SM100-O0-NEXT: min.s16x2 %r7, %r6, %r5;
+; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-SM100-O0-NEXT: min.s16 %rs11, %rs9, %rs10;
+; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-O0-NEXT: ret;
; CHECK-SM80-LABEL: reduce_smin_i16_nonpow2(
; CHECK-SM80: {
; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
@@ -1583,6 +2350,22 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_smin_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_smin_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
+; CHECK-O0-NEXT: min.s32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: min.s32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: min.s32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: min.s32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: min.s32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: min.s32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: min.s32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smin_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1590,13 +2373,13 @@ define i32 @reduce_smin_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
-; CHECK-NEXT: min.s32 %r9, %r3, %r7;
-; CHECK-NEXT: min.s32 %r10, %r1, %r5;
-; CHECK-NEXT: min.s32 %r11, %r4, %r8;
-; CHECK-NEXT: min.s32 %r12, %r2, %r6;
-; CHECK-NEXT: min.s32 %r13, %r12, %r11;
-; CHECK-NEXT: min.s32 %r14, %r10, %r9;
-; CHECK-NEXT: min.s32 %r15, %r14, %r13;
+; CHECK-NEXT: min.s32 %r9, %r4, %r8;
+; CHECK-NEXT: min.s32 %r10, %r2, %r6;
+; CHECK-NEXT: min.s32 %r11, %r10, %r9;
+; CHECK-NEXT: min.s32 %r12, %r3, %r7;
+; CHECK-NEXT: min.s32 %r13, %r1, %r5;
+; CHECK-NEXT: min.s32 %r14, %r13, %r12;
+; CHECK-NEXT: min.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in)
@@ -1604,6 +2387,22 @@ define i32 @reduce_smin_i32(<8 x i32> %in) {
}
define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_smin_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: min.s32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: min.s32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: min.s32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: min.s32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: min.s32 %r12, %r4, %r11;
+; CHECK-O0-NEXT: min.s32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smin_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1625,48 +2424,48 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_and_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_and_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
-; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_and_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
-; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_and_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
+; CHECK-NEXT: and.b32 %r5, %r2, %r4;
+; CHECK-NEXT: and.b32 %r6, %r1, %r3;
+; CHECK-NEXT: and.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.and(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
+; CHECK-O0-LABEL: reduce_and_i16_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_and_i16_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
+; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
+; CHECK-O0-NEXT: mov.b16 %rs8, -1;
+; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-O0-NEXT: and.b32 %r5, %r3, %r4;
+; CHECK-O0-NEXT: and.b32 %r6, %r2, %r1;
+; CHECK-O0-NEXT: and.b32 %r7, %r6, %r5;
+; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-O0-NEXT: and.b16 %rs11, %rs9, %rs10;
+; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i16_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<12>;
@@ -1694,6 +2493,22 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_and_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_and_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
+; CHECK-O0-NEXT: and.b32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: and.b32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: and.b32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: and.b32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: and.b32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: and.b32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1701,13 +2516,13 @@ define i32 @reduce_and_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
-; CHECK-NEXT: and.b32 %r9, %r3, %r7;
-; CHECK-NEXT: and.b32 %r10, %r1, %r5;
-; CHECK-NEXT: and.b32 %r11, %r4, %r8;
-; CHECK-NEXT: and.b32 %r12, %r2, %r6;
-; CHECK-NEXT: and.b32 %r13, %r12, %r11;
-; CHECK-NEXT: and.b32 %r14, %r10, %r9;
-; CHECK-NEXT: and.b32 %r15, %r14, %r13;
+; CHECK-NEXT: and.b32 %r9, %r4, %r8;
+; CHECK-NEXT: and.b32 %r10, %r2, %r6;
+; CHECK-NEXT: and.b32 %r11, %r10, %r9;
+; CHECK-NEXT: and.b32 %r12, %r3, %r7;
+; CHECK-NEXT: and.b32 %r13, %r1, %r5;
+; CHECK-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-NEXT: and.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.and(<8 x i32> %in)
@@ -1715,6 +2530,22 @@ define i32 @reduce_and_i32(<8 x i32> %in) {
}
define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_and_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: and.b32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: and.b32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: and.b32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: and.b32 %r12, %r11, %r4;
+; CHECK-O0-NEXT: and.b32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1736,48 +2567,48 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_or_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_or_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
-; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_or_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
-; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_or_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
+; CHECK-NEXT: or.b32 %r5, %r2, %r4;
+; CHECK-NEXT: or.b32 %r6, %r1, %r3;
+; CHECK-NEXT: or.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.or(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
+; CHECK-O0-LABEL: reduce_or_i16_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_or_i16_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
+; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
+; CHECK-O0-NEXT: mov.b16 %rs8, 0;
+; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-O0-NEXT: or.b32 %r5, %r3, %r4;
+; CHECK-O0-NEXT: or.b32 %r6, %r2, %r1;
+; CHECK-O0-NEXT: or.b32 %r7, %r6, %r5;
+; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-O0-NEXT: or.b16 %rs11, %rs9, %rs10;
+; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i16_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<12>;
@@ -1805,6 +2636,22 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_or_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_or_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
+; CHECK-O0-NEXT: or.b32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: or.b32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: or.b32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: or.b32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: or.b32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1812,13 +2659,13 @@ define i32 @reduce_or_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
-; CHECK-NEXT: or.b32 %r9, %r3, %r7;
-; CHECK-NEXT: or.b32 %r10, %r1, %r5;
-; CHECK-NEXT: or.b32 %r11, %r4, %r8;
-; CHECK-NEXT: or.b32 %r12, %r2, %r6;
-; CHECK-NEXT: or.b32 %r13, %r12, %r11;
-; CHECK-NEXT: or.b32 %r14, %r10, %r9;
-; CHECK-NEXT: or.b32 %r15, %r14, %r13;
+; CHECK-NEXT: or.b32 %r9, %r4, %r8;
+; CHECK-NEXT: or.b32 %r10, %r2, %r6;
+; CHECK-NEXT: or.b32 %r11, %r10, %r9;
+; CHECK-NEXT: or.b32 %r12, %r3, %r7;
+; CHECK-NEXT: or.b32 %r13, %r1, %r5;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: or.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.or(<8 x i32> %in)
@@ -1826,6 +2673,22 @@ define i32 @reduce_or_i32(<8 x i32> %in) {
}
define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_or_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: or.b32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: or.b32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: or.b32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: or.b32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: or.b32 %r12, %r11, %r4;
+; CHECK-O0-NEXT: or.b32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1847,48 +2710,48 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
}
define i16 @reduce_xor_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_xor_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
-; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_xor_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
-; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_xor_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
+; CHECK-NEXT: xor.b32 %r5, %r2, %r4;
+; CHECK-NEXT: xor.b32 %r6, %r1, %r3;
+; CHECK-NEXT: xor.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in)
ret i16 %res
}
define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
+; CHECK-O0-LABEL: reduce_xor_i16_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b16 %rs<12>;
+; CHECK-O0-NEXT: .reg .b32 %r<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
+; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
+; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
+; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
+; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
+; CHECK-O0-NEXT: mov.b16 %rs8, 0;
+; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
+; CHECK-O0-NEXT: xor.b32 %r5, %r3, %r4;
+; CHECK-O0-NEXT: xor.b32 %r6, %r2, %r1;
+; CHECK-O0-NEXT: xor.b32 %r7, %r6, %r5;
+; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
+; CHECK-O0-NEXT: xor.b16 %rs11, %rs9, %rs10;
+; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i16_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<12>;
@@ -1916,6 +2779,22 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
}
define i32 @reduce_xor_i32(<8 x i32> %in) {
+; CHECK-O0-LABEL: reduce_xor_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<16>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
+; CHECK-O0-NEXT: xor.b32 %r9, %r4, %r8;
+; CHECK-O0-NEXT: xor.b32 %r10, %r2, %r6;
+; CHECK-O0-NEXT: xor.b32 %r11, %r10, %r9;
+; CHECK-O0-NEXT: xor.b32 %r12, %r3, %r7;
+; CHECK-O0-NEXT: xor.b32 %r13, %r1, %r5;
+; CHECK-O0-NEXT: xor.b32 %r14, %r13, %r12;
+; CHECK-O0-NEXT: xor.b32 %r15, %r14, %r11;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1923,13 +2802,13 @@ define i32 @reduce_xor_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
-; CHECK-NEXT: xor.b32 %r9, %r3, %r7;
-; CHECK-NEXT: xor.b32 %r10, %r1, %r5;
-; CHECK-NEXT: xor.b32 %r11, %r4, %r8;
-; CHECK-NEXT: xor.b32 %r12, %r2, %r6;
-; CHECK-NEXT: xor.b32 %r13, %r12, %r11;
-; CHECK-NEXT: xor.b32 %r14, %r10, %r9;
-; CHECK-NEXT: xor.b32 %r15, %r14, %r13;
+; CHECK-NEXT: xor.b32 %r9, %r4, %r8;
+; CHECK-NEXT: xor.b32 %r10, %r2, %r6;
+; CHECK-NEXT: xor.b32 %r11, %r10, %r9;
+; CHECK-NEXT: xor.b32 %r12, %r3, %r7;
+; CHECK-NEXT: xor.b32 %r13, %r1, %r5;
+; CHECK-NEXT: xor.b32 %r14, %r13, %r12;
+; CHECK-NEXT: xor.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in)
@@ -1937,6 +2816,22 @@ define i32 @reduce_xor_i32(<8 x i32> %in) {
}
define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) {
+; CHECK-O0-LABEL: reduce_xor_i32_nonpow2(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<14>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
+; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
+; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
+; CHECK-O0-NEXT: xor.b32 %r8, %r3, %r7;
+; CHECK-O0-NEXT: xor.b32 %r9, %r1, %r5;
+; CHECK-O0-NEXT: xor.b32 %r10, %r9, %r8;
+; CHECK-O0-NEXT: xor.b32 %r11, %r2, %r6;
+; CHECK-O0-NEXT: xor.b32 %r12, %r11, %r4;
+; CHECK-O0-NEXT: xor.b32 %r13, %r10, %r12;
+; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
>From 59a0fc24111cd753b8ee2a21351efb5573a8fa80 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 17 Apr 2025 20:57:23 -0700
Subject: [PATCH 2/4] [NVPTX] support rest of VECREDUCE intrinsics and other
improvements
- Support all VECREDUCE intrinsics
- Clean up FileCheck directives in lit test
- Also handle sequential lowering in NVPTX backend, where we can still
use larger operations.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 176 +-
.../CodeGen/NVPTX/reduction-intrinsics.ll | 2633 ++++++-----------
2 files changed, 979 insertions(+), 1830 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bafd91b930052..45dd661613d81 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -870,12 +870,22 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine(ISD::SETCC);
// Vector reduction operations. These are transformed into a tree evaluation
- // of nodes which may or may not be legal.
+ // of nodes which may initially be illegal.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
- ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
- ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
- VT, Custom);
+ MVT EltVT = VT.getVectorElementType();
+ if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
+ EltVT == MVT::f64) {
+ setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+ VT, Custom);
+ } else if (EltVT.isScalarInteger()) {
+ setOperationAction(
+ {ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND,
+ ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX,
+ ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
+ VT, Custom);
+ }
}
// Promote fp16 arithmetic if fp16 hardware isn't available or the
@@ -2213,29 +2223,17 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-/// A generic routine for constructing a tree reduction for a vector operand.
+/// A generic routine for constructing a tree reduction on a vector operand.
/// This method differs from iterative splitting in DAGTypeLegalizer by
-/// first scalarizing the vector and then progressively grouping elements
-/// bottom-up. This allows easily building the optimal (minimum) number of nodes
-/// with different numbers of operands (eg. max3 vs max2).
+/// progressively grouping elements bottom-up.
static SDValue BuildTreeReduction(
- const SDValue &VectorOp,
+ const SmallVector<SDValue> &Elements, EVT EltTy,
ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
- EVT VectorTy = VectorOp.getValueType();
- EVT EltTy = VectorTy.getVectorElementType();
- const unsigned NumElts = VectorTy.getVectorNumElements();
-
- // scalarize vector
- SmallVector<SDValue> Elements(NumElts);
- for (unsigned I = 0, E = NumElts; I != E; ++I) {
- Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorOp,
- DAG.getConstant(I, DL, MVT::i64));
- }
-
// now build the computation graph in place at each level
SmallVector<SDValue> Level = Elements;
- for (unsigned OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
+ unsigned OpIdx = 0;
+ while (Level.size() > 1) {
const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
// partially reduce all elements in level
@@ -2267,52 +2265,139 @@ static SDValue BuildTreeReduction(
return *Level.begin();
}
-/// Lower fadd/fmul vector reductions. Builds a computation graph (tree) and
-/// serializes it.
+/// Lower a vector reduction to a tree of operations when reassociation is
+/// allowed, or to a sequential chain otherwise. Larger operations such as
+/// max3/min3 are used when the target supports them.
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
- // If we can't reorder sub-operations, let DAGTypeLegalizer lower this op.
- if (DisableFOpTreeReduce || !Op->getFlags().hasAllowReassociation())
+ if (DisableFOpTreeReduce)
return SDValue();
- EVT EltTy = Op.getOperand(0).getValueType().getVectorElementType();
+ SDLoc DL(Op);
+ const SDNodeFlags Flags = Op->getFlags();
+ const SDValue &Vector = Op.getOperand(0);
+ EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;
- SDLoc DL(Op);
- SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Operators;
+
+ // A list of SDNode opcodes with equivalent semantics, sorted descending by
+ // number of inputs they take.
+ SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+ bool IsReassociatable;
+
switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
- Operators = {{ISD::FADD, 2}};
+ ScalarOps = {{ISD::FADD, 2}};
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMUL:
- Operators = {{ISD::FMUL, 2}};
+ ScalarOps = {{ISD::FMUL, 2}};
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMAX:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMAXNUM3, 3});
- Operators.push_back({ISD::FMAXNUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
+ ScalarOps.push_back({ISD::FMAXNUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMIN:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMINNUM3, 3});
- Operators.push_back({ISD::FMINNUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
+ ScalarOps.push_back({ISD::FMINNUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMAXIMUM:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMAXIMUM3, 3});
- Operators.push_back({ISD::FMAXIMUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
+ ScalarOps.push_back({ISD::FMAXIMUM, 2});
+ IsReassociatable = false;
break;
case ISD::VECREDUCE_FMINIMUM:
if (CanUseMinMax3)
- Operators.push_back({NVPTXISD::FMINIMUM3, 3});
- Operators.push_back({ISD::FMINIMUM, 2});
+ ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
+ ScalarOps.push_back({ISD::FMINIMUM, 2});
+ IsReassociatable = false;
+ break;
+ case ISD::VECREDUCE_ADD:
+ ScalarOps = {{ISD::ADD, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_MUL:
+ ScalarOps = {{ISD::MUL, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_UMAX:
+ ScalarOps = {{ISD::UMAX, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_UMIN:
+ ScalarOps = {{ISD::UMIN, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_SMAX:
+ ScalarOps = {{ISD::SMAX, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_SMIN:
+ ScalarOps = {{ISD::SMIN, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_AND:
+ ScalarOps = {{ISD::AND, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_OR:
+ ScalarOps = {{ISD::OR, 2}};
+ IsReassociatable = true;
+ break;
+ case ISD::VECREDUCE_XOR:
+ ScalarOps = {{ISD::XOR, 2}};
+ IsReassociatable = true;
break;
default:
llvm_unreachable("unhandled vecreduce operation");
}
- return BuildTreeReduction(Op.getOperand(0), Operators, DL, Op->getFlags(),
- DAG);
+ EVT VectorTy = Vector.getValueType();
+ const unsigned NumElts = VectorTy.getVectorNumElements();
+
+ // scalarize vector
+ SmallVector<SDValue> Elements(NumElts);
+ for (unsigned I = 0, E = NumElts; I != E; ++I) {
+ Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, Vector,
+ DAG.getConstant(I, DL, MVT::i64));
+ }
+
+ // Lower to tree reduction.
+ if (IsReassociatable || Flags.hasAllowReassociation())
+ return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+
+ // Lower to sequential reduction.
+ SDValue Accumulator;
+ for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
+ assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
+ const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
+
+ if (!Accumulator) {
+ if (I + DefaultGroupSize <= NumElts) {
+ Accumulator = DAG.getNode(
+ DefaultScalarOp, DL, EltTy,
+ ArrayRef(Elements).slice(I, I + DefaultGroupSize), Flags);
+ I += DefaultGroupSize;
+ }
+ }
+
+ if (Accumulator) {
+ for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) {
+ SmallVector<SDValue> Operands = {Accumulator};
+ for (unsigned K = 0; K < DefaultGroupSize - 1; ++K)
+ Operands.push_back(Elements[I + K]);
+ Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags);
+ }
+ }
+ }
+
+ return Accumulator;
}
SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
@@ -3153,6 +3238,15 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 90c6b0ebb6725..72cd7e38bda9f 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -1,14 +1,14 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 \
+; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_80 %}
-; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s
-; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 \
+; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \
; RUN: -disable-post-ra -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 %}
target triple = "nvptx64-nvidia-cuda"
@@ -42,6 +42,7 @@ define half @reduce_fadd_half(<8 x half> %in) {
ret half %res
}
+; Check tree reduction.
define half @reduce_fadd_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fadd_half_reassoc(
; CHECK: {
@@ -69,27 +70,8 @@ define half @reduce_fadd_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fadd_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<16>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fadd_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fadd_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fadd_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: mov.b16 %rs8, 0x0000;
-; CHECK-O0-NEXT: add.rn.f16 %rs9, %rs1, %rs8;
-; CHECK-O0-NEXT: add.rn.f16 %rs10, %rs9, %rs2;
-; CHECK-O0-NEXT: add.rn.f16 %rs11, %rs10, %rs3;
-; CHECK-O0-NEXT: add.rn.f16 %rs12, %rs11, %rs4;
-; CHECK-O0-NEXT: add.rn.f16 %rs13, %rs12, %rs5;
-; CHECK-O0-NEXT: add.rn.f16 %rs14, %rs13, %rs6;
-; CHECK-O0-NEXT: add.rn.f16 %rs15, %rs14, %rs7;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
@@ -116,23 +98,6 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fadd_float(<8 x float> %in) {
-; CHECK-O0-LABEL: reduce_fadd_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<17>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0];
-; CHECK-O0-NEXT: add.rn.f32 %r9, %r1, 0f00000000;
-; CHECK-O0-NEXT: add.rn.f32 %r10, %r9, %r2;
-; CHECK-O0-NEXT: add.rn.f32 %r11, %r10, %r3;
-; CHECK-O0-NEXT: add.rn.f32 %r12, %r11, %r4;
-; CHECK-O0-NEXT: add.rn.f32 %r13, %r12, %r5;
-; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, %r6;
-; CHECK-O0-NEXT: add.rn.f32 %r15, %r14, %r7;
-; CHECK-O0-NEXT: add.rn.f32 %r16, %r15, %r8;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r16;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<17>;
@@ -154,24 +119,8 @@ define float @reduce_fadd_float(<8 x float> %in) {
ret float %res
}
+; Check tree reduction.
define float @reduce_fadd_float_reassoc(<8 x float> %in) {
-; CHECK-O0-LABEL: reduce_fadd_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<17>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0];
-; CHECK-O0-NEXT: add.rn.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: add.rn.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: add.rn.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: add.rn.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: add.rn.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: add.rn.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: add.rn.f32 %r16, %r15, 0f00000000;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r16;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<17>;
@@ -193,24 +142,8 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
ret float %res
}
+; Check tree reduction with non-power of 2 size.
define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-O0-LABEL: reduce_fadd_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<15>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: add.rn.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: add.rn.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: add.rn.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: add.rn.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: add.rn.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: add.rn.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: add.rn.f32 %r14, %r13, 0f00000000;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<15>;
@@ -258,6 +191,7 @@ define half @reduce_fmul_half(<8 x half> %in) {
ret half %res
}
+; Check tree reduction.
define half @reduce_fmul_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmul_half_reassoc(
; CHECK: {
@@ -283,25 +217,8 @@ define half @reduce_fmul_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fmul_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmul_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmul_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmul_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: mul.rn.f16 %rs8, %rs3, %rs4;
-; CHECK-O0-NEXT: mul.rn.f16 %rs9, %rs1, %rs2;
-; CHECK-O0-NEXT: mul.rn.f16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: mul.rn.f16 %rs11, %rs5, %rs6;
-; CHECK-O0-NEXT: mul.rn.f16 %rs12, %rs11, %rs7;
-; CHECK-O0-NEXT: mul.rn.f16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -326,22 +243,6 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmul_float(<8 x float> %in) {
-; CHECK-O0-LABEL: reduce_fmul_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0];
-; CHECK-O0-NEXT: mul.rn.f32 %r9, %r1, %r2;
-; CHECK-O0-NEXT: mul.rn.f32 %r10, %r9, %r3;
-; CHECK-O0-NEXT: mul.rn.f32 %r11, %r10, %r4;
-; CHECK-O0-NEXT: mul.rn.f32 %r12, %r11, %r5;
-; CHECK-O0-NEXT: mul.rn.f32 %r13, %r12, %r6;
-; CHECK-O0-NEXT: mul.rn.f32 %r14, %r13, %r7;
-; CHECK-O0-NEXT: mul.rn.f32 %r15, %r14, %r8;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -362,23 +263,8 @@ define float @reduce_fmul_float(<8 x float> %in) {
ret float %res
}
+; Check tree reduction.
define float @reduce_fmul_float_reassoc(<8 x float> %in) {
-; CHECK-O0-LABEL: reduce_fmul_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0];
-; CHECK-O0-NEXT: mul.rn.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: mul.rn.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: mul.rn.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: mul.rn.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: mul.rn.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: mul.rn.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: mul.rn.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float_reassoc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -399,23 +285,8 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
ret float %res
}
+; Check tree reduction with non-power of 2 size.
define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
-; CHECK-O0-LABEL: reduce_fmul_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: mul.rn.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: mul.rn.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: mul.rn.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: mul.rn.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: mul.rn.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: mul.rn.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -440,22 +311,29 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
define half @reduce_fmax_half(<8 x half> %in) {
; CHECK-LABEL: reduce_fmax_half(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
-; CHECK-NEXT: max.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: max.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: max.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: max.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: max.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: max.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: max.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: max.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: max.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmax(<8 x half> %in)
ret half %res
}
+; Check tree reduction.
define half @reduce_fmax_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmax_half_reassoc(
; CHECK: {
@@ -481,25 +359,8 @@ define half @reduce_fmax_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fmax_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: max.f16 %rs8, %rs3, %rs4;
-; CHECK-O0-NEXT: max.f16 %rs9, %rs1, %rs2;
-; CHECK-O0-NEXT: max.f16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: max.f16 %rs11, %rs5, %rs6;
-; CHECK-O0-NEXT: max.f16 %rs12, %rs11, %rs7;
-; CHECK-O0-NEXT: max.f16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmax_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -524,115 +385,108 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmax_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
+; CHECK-SM80-NEXT: max.f32 %r9, %r1, %r2;
+; CHECK-SM80-NEXT: max.f32 %r10, %r9, %r3;
+; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r4;
+; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r5;
+; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r6;
+; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r7;
+; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmax_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
-; CHECK-O0-NEXT: max.f32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: max.f32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: max.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.f32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: max.f32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: max.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmax_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
-; CHECK-NEXT: max.f32 %r9, %r4, %r8;
-; CHECK-NEXT: max.f32 %r10, %r2, %r6;
-; CHECK-NEXT: max.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.f32 %r12, %r3, %r7;
-; CHECK-NEXT: max.f32 %r13, %r1, %r5;
-; CHECK-NEXT: max.f32 %r14, %r13, %r12;
-; CHECK-NEXT: max.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmax_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
+; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r4, %r5;
+; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
ret float %res
}
+; Check tree reduction.
define float @reduce_fmax_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmax_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-O0-NEXT: max.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: max.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: max.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: max.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: max.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmax_float_reassoc(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-NEXT: max.f32 %r9, %r7, %r8;
-; CHECK-NEXT: max.f32 %r10, %r5, %r6;
-; CHECK-NEXT: max.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.f32 %r12, %r3, %r4;
-; CHECK-NEXT: max.f32 %r13, %r1, %r2;
-; CHECK-NEXT: max.f32 %r14, %r13, %r12;
-; CHECK-NEXT: max.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmax_float_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-SM100-NEXT: max.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r7;
+; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in)
ret float %res
}
+; Check tree reduction with non-power of 2 size.
define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmax_float_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: max.f32 %r8, %r5, %r6;
+; CHECK-SM80-NEXT: max.f32 %r9, %r8, %r7;
+; CHECK-SM80-NEXT: max.f32 %r10, %r3, %r4;
+; CHECK-SM80-NEXT: max.f32 %r11, %r1, %r2;
+; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r10;
+; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r9;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmax_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: max.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: max.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: max.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: max.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: max.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: max.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: max.f32 %r8, %r5, %r6;
-; CHECK-NEXT: max.f32 %r9, %r8, %r7;
-; CHECK-NEXT: max.f32 %r10, %r3, %r4;
-; CHECK-NEXT: max.f32 %r11, %r1, %r2;
-; CHECK-NEXT: max.f32 %r12, %r11, %r10;
-; CHECK-NEXT: max.f32 %r13, %r12, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmax_float_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: max.f32 %r8, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r8, %r7;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
ret float %res
}
@@ -641,22 +495,29 @@ define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
define half @reduce_fmin_half(<8 x half> %in) {
; CHECK-LABEL: reduce_fmin_half(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
-; CHECK-NEXT: min.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: min.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: min.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: min.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: min.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: min.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: min.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: min.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: min.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: min.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmin(<8 x half> %in)
ret half %res
}
+; Check tree reduction.
define half @reduce_fmin_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmin_half_reassoc(
; CHECK: {
@@ -682,25 +543,8 @@ define half @reduce_fmin_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fmin_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmin_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: min.f16 %rs8, %rs3, %rs4;
-; CHECK-O0-NEXT: min.f16 %rs9, %rs1, %rs2;
-; CHECK-O0-NEXT: min.f16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: min.f16 %rs11, %rs5, %rs6;
-; CHECK-O0-NEXT: min.f16 %rs12, %rs11, %rs7;
-; CHECK-O0-NEXT: min.f16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmin_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -725,115 +569,108 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmin_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
+; CHECK-SM80-NEXT: min.f32 %r9, %r1, %r2;
+; CHECK-SM80-NEXT: min.f32 %r10, %r9, %r3;
+; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r4;
+; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r5;
+; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r6;
+; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r7;
+; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmin_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
-; CHECK-O0-NEXT: min.f32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: min.f32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: min.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.f32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: min.f32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: min.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmin_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
-; CHECK-NEXT: min.f32 %r9, %r4, %r8;
-; CHECK-NEXT: min.f32 %r10, %r2, %r6;
-; CHECK-NEXT: min.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.f32 %r12, %r3, %r7;
-; CHECK-NEXT: min.f32 %r13, %r1, %r5;
-; CHECK-NEXT: min.f32 %r14, %r13, %r12;
-; CHECK-NEXT: min.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmin_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
+; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r4, %r5;
+; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmin(<8 x float> %in)
ret float %res
}
+; Check tree reduction.
define float @reduce_fmin_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmin_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-O0-NEXT: min.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: min.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: min.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: min.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: min.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmin_float_reassoc(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-NEXT: min.f32 %r9, %r7, %r8;
-; CHECK-NEXT: min.f32 %r10, %r5, %r6;
-; CHECK-NEXT: min.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.f32 %r12, %r3, %r4;
-; CHECK-NEXT: min.f32 %r13, %r1, %r2;
-; CHECK-NEXT: min.f32 %r14, %r13, %r12;
-; CHECK-NEXT: min.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmin_float_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-SM100-NEXT: min.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r7;
+; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in)
ret float %res
}
+; Check tree reduction with non-power of 2 size.
define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmin_float_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: min.f32 %r8, %r5, %r6;
+; CHECK-SM80-NEXT: min.f32 %r9, %r8, %r7;
+; CHECK-SM80-NEXT: min.f32 %r10, %r3, %r4;
+; CHECK-SM80-NEXT: min.f32 %r11, %r1, %r2;
+; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r10;
+; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r9;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmin_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: min.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: min.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: min.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: min.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: min.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: min.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmin_float_reassoc_nonpow2(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: min.f32 %r8, %r5, %r6;
-; CHECK-NEXT: min.f32 %r9, %r8, %r7;
-; CHECK-NEXT: min.f32 %r10, %r3, %r4;
-; CHECK-NEXT: min.f32 %r11, %r1, %r2;
-; CHECK-NEXT: min.f32 %r12, %r11, %r10;
-; CHECK-NEXT: min.f32 %r13, %r12, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmin_float_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: min.f32 %r8, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r8, %r7;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in)
ret float %res
}
@@ -842,22 +679,29 @@ define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
define half @reduce_fmaximum_half(<8 x half> %in) {
; CHECK-LABEL: reduce_fmaximum_half(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0];
-; CHECK-NEXT: max.NaN.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: max.NaN.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: max.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: max.NaN.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: max.NaN.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: max.NaN.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: max.NaN.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmaximum(<8 x half> %in)
ret half %res
}
+; Check tree reduction.
define half @reduce_fmaximum_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fmaximum_half_reassoc(
; CHECK: {
@@ -883,25 +727,8 @@ define half @reduce_fmaximum_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fmaximum_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fmaximum_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fmaximum_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmaximum_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: max.NaN.f16 %rs8, %rs3, %rs4;
-; CHECK-O0-NEXT: max.NaN.f16 %rs9, %rs1, %rs2;
-; CHECK-O0-NEXT: max.NaN.f16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: max.NaN.f16 %rs11, %rs5, %rs6;
-; CHECK-O0-NEXT: max.NaN.f16 %rs12, %rs11, %rs7;
-; CHECK-O0-NEXT: max.NaN.f16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fmaximum_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -926,115 +753,108 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fmaximum_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r1, %r2;
+; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r9, %r3;
+; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r4;
+; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r11, %r5;
+; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r12, %r6;
+; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r7;
+; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmaximum_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
-; CHECK-O0-NEXT: max.NaN.f32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: max.NaN.f32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: max.NaN.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.NaN.f32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: max.NaN.f32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: max.NaN.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.NaN.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmaximum_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
-; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8;
-; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6;
-; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7;
-; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5;
-; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12;
-; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmaximum_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r9, %r4, %r5;
+; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in)
ret float %res
}
+; Check tree reduction.
define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmaximum_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-O0-NEXT: max.NaN.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: max.NaN.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: max.NaN.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.NaN.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: max.NaN.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: max.NaN.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.NaN.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmaximum_float_reassoc(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-NEXT: max.NaN.f32 %r9, %r7, %r8;
-; CHECK-NEXT: max.NaN.f32 %r10, %r5, %r6;
-; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r4;
-; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r2;
-; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12;
-; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r7;
+; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in)
ret float %res
}
+; Check tree reduction with non-power of 2 size.
define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<14>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: max.NaN.f32 %r8, %r5, %r6;
+; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r8, %r7;
+; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r3, %r4;
+; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r1, %r2;
+; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r11, %r10;
+; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r12, %r9;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: max.NaN.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: max.NaN.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: max.NaN.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: max.NaN.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: max.NaN.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: max.NaN.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: max.NaN.f32 %r8, %r5, %r6;
-; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r7;
-; CHECK-NEXT: max.NaN.f32 %r10, %r3, %r4;
-; CHECK-NEXT: max.NaN.f32 %r11, %r1, %r2;
-; CHECK-NEXT: max.NaN.f32 %r12, %r11, %r10;
-; CHECK-NEXT: max.NaN.f32 %r13, %r12, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc_nonpow2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: max.NaN.f32 %r8, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r9, %r8, %r7;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: ret;
%res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in)
ret float %res
}
@@ -1043,22 +863,29 @@ define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
define half @reduce_fminimum_half(<8 x half> %in) {
; CHECK-LABEL: reduce_fminimum_half(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0];
-; CHECK-NEXT: min.NaN.f16x2 %r5, %r2, %r4;
-; CHECK-NEXT: min.NaN.f16x2 %r6, %r1, %r3;
-; CHECK-NEXT: min.NaN.f16x2 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: min.NaN.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: min.NaN.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: min.NaN.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: min.NaN.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fminimum(<8 x half> %in)
ret half %res
}
+; Check tree reduction.
define half @reduce_fminimum_half_reassoc(<8 x half> %in) {
; CHECK-LABEL: reduce_fminimum_half_reassoc(
; CHECK: {
@@ -1084,25 +911,8 @@ define half @reduce_fminimum_half_reassoc(<8 x half> %in) {
ret half %res
}
+; Check tree reduction with non-power of 2 size.
define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) {
-; CHECK-O0-LABEL: reduce_fminimum_half_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_fminimum_half_reassoc_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_fminimum_half_reassoc_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fminimum_half_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: min.NaN.f16 %rs8, %rs3, %rs4;
-; CHECK-O0-NEXT: min.NaN.f16 %rs9, %rs1, %rs2;
-; CHECK-O0-NEXT: min.NaN.f16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: min.NaN.f16 %rs11, %rs5, %rs6;
-; CHECK-O0-NEXT: min.NaN.f16 %rs12, %rs11, %rs7;
-; CHECK-O0-NEXT: min.NaN.f16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: st.param.b16 [func_retval0], %rs13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_fminimum_half_reassoc_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -1127,270 +937,166 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) {
; Check straight-line reduction.
define float @reduce_fminimum_float(<8 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_float(
+; CHECK-SM80: {
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
+; CHECK-SM80-EMPTY:
+; CHECK-SM80-NEXT: // %bb.0:
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r1, %r2;
+; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r9, %r3;
+; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r4;
+; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r11, %r5;
+; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r12, %r6;
+; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r7;
+; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
+; CHECK-SM80-NEXT: ret;
;
-; CHECK-O0-LABEL: reduce_fminimum_float(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
-; CHECK-O0-NEXT: min.NaN.f32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: min.NaN.f32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: min.NaN.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.NaN.f32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: min.NaN.f32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: min.NaN.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.NaN.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fminimum_float(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
-; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8;
-; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6;
-; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7;
-; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5;
-; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12;
-; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
+; CHECK-SM100-LABEL: reduce_fminimum_float(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r9, %r4, %r5;
+; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-SM100-NEXT: ret;
%res = call float @llvm.vector.reduce.fminimum(<8 x float> %in)
ret float %res
}
+; Check tree reduction.
define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
-;
-; CHECK-O0-LABEL: reduce_fminimum_float_reassoc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-O0-NEXT: min.NaN.f32 %r9, %r7, %r8;
-; CHECK-O0-NEXT: min.NaN.f32 %r10, %r5, %r6;
-; CHECK-O0-NEXT: min.NaN.f32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.NaN.f32 %r12, %r3, %r4;
-; CHECK-O0-NEXT: min.NaN.f32 %r13, %r1, %r2;
-; CHECK-O0-NEXT: min.NaN.f32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.NaN.f32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fminimum_float_reassoc(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<16>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-NEXT: min.NaN.f32 %r9, %r7, %r8;
-; CHECK-NEXT: min.NaN.f32 %r10, %r5, %r6;
-; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r4;
-; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r2;
-; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12;
-; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-NEXT: ret;
- %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in)
- ret float %res
-}
-
-define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
-;
-; CHECK-O0-LABEL: reduce_fminimum_float_reassoc_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
-; CHECK-O0-NEXT: min.NaN.f32 %r8, %r5, %r6;
-; CHECK-O0-NEXT: min.NaN.f32 %r9, %r8, %r7;
-; CHECK-O0-NEXT: min.NaN.f32 %r10, %r3, %r4;
-; CHECK-O0-NEXT: min.NaN.f32 %r11, %r1, %r2;
-; CHECK-O0-NEXT: min.NaN.f32 %r12, %r11, %r10;
-; CHECK-O0-NEXT: min.NaN.f32 %r13, %r12, %r9;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
-; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<14>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
-; CHECK-NEXT: min.NaN.f32 %r8, %r5, %r6;
-; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r7;
-; CHECK-NEXT: min.NaN.f32 %r10, %r3, %r4;
-; CHECK-NEXT: min.NaN.f32 %r11, %r1, %r2;
-; CHECK-NEXT: min.NaN.f32 %r12, %r11, %r10;
-; CHECK-NEXT: min.NaN.f32 %r13, %r12, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-NEXT: ret;
- %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
- ret float %res
-}
-
-define i16 @reduce_add_i16(<8 x i16> %in) {
-;
-; CHECK-SM80-LABEL: reduce_add_i16(
+; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc(
; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
-; CHECK-SM80-NEXT: .reg .b32 %r<6>;
+; CHECK-SM80-NEXT: .reg .b32 %r<16>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
-; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: add.s16 %rs5, %rs4, %rs2;
-; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs7;
-; CHECK-SM80-NEXT: add.s16 %rs11, %rs10, %rs5;
-; CHECK-SM80-NEXT: add.s16 %rs12, %rs3, %rs1;
-; CHECK-SM80-NEXT: add.s16 %rs13, %rs8, %rs6;
-; CHECK-SM80-NEXT: add.s16 %rs14, %rs13, %rs12;
-; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs11;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_add_i16(
+; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-NEXT: .reg .b32 %r<13>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
-; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r7;
+; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r8;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-SM100-NEXT: ret;
- %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in)
- ret i16 %res
+ %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in)
+ ret float %res
}
-define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
-;
-;
-; CHECK-SM80-O0-LABEL: reduce_add_i16_nonpow2(
-; CHECK-SM80-O0: {
-; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-O0-EMPTY:
-; CHECK-SM80-O0-NEXT: // %bb.0:
-; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
-; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
-; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
-; CHECK-SM80-O0-NEXT: add.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-O0-NEXT: add.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-O0-NEXT: add.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-O0-NEXT: add.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-O0-NEXT: add.s16 %rs12, %rs11, %rs4;
-; CHECK-SM80-O0-NEXT: add.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-O0-NEXT: ret;
-;
-; CHECK-SM100-O0-LABEL: reduce_add_i16_nonpow2(
-; CHECK-SM100-O0: {
-; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-O0-EMPTY:
-; CHECK-SM100-O0-NEXT: // %bb.0:
-; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
-; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
-; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 0;
-; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-O0-NEXT: add.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-O0-NEXT: add.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-O0-NEXT: add.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-O0-NEXT: add.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-O0-NEXT: ret;
-; CHECK-SM80-LABEL: reduce_add_i16_nonpow2(
+; Check tree reduction with non-power of 2 size.
+define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
+; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc_nonpow2(
; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-NEXT: .reg .b32 %r<3>;
+; CHECK-SM80-NEXT: .reg .b32 %r<14>;
; CHECK-SM80-EMPTY:
; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
-; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
-; CHECK-SM80-NEXT: add.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-NEXT: add.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-NEXT: add.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-NEXT: add.s16 %rs12, %rs11, %rs4;
-; CHECK-SM80-NEXT: add.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
+; CHECK-SM80-NEXT: min.NaN.f32 %r8, %r5, %r6;
+; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r8, %r7;
+; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r3, %r4;
+; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r1, %r2;
+; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r11, %r10;
+; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r12, %r9;
+; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-SM80-NEXT: ret;
;
-; CHECK-SM100-LABEL: reduce_add_i16_nonpow2(
+; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc_nonpow2(
; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
+; CHECK-SM100-NEXT: .reg .b32 %r<11>;
; CHECK-SM100-EMPTY:
; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
-; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
-; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
-; CHECK-SM100-NEXT: mov.b16 %rs8, 0;
-; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-NEXT: add.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-NEXT: add.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-NEXT: add.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
+; CHECK-SM100-NEXT: min.NaN.f32 %r8, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r9, %r8, %r7;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-SM100-NEXT: ret;
+ %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
+ ret float %res
+}
+
+; Check tree reduction.
+define i16 @reduce_add_i16(<8 x i16> %in) {
+; CHECK-LABEL: reduce_add_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: add.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: add.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: add.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: add.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: add.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: add.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
+ %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in)
+ ret i16 %res
+}
+
+; Check tree reduction with non-power of 2 size.
+define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
+; CHECK-LABEL: reduce_add_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-NEXT: add.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: add.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: add.s16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: add.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.add(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_add_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_add_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
-; CHECK-O0-NEXT: add.s32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: add.s32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: add.s32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: add.s32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: add.s32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: add.s32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: add.s32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_add_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1398,11 +1104,11 @@ define i32 @reduce_add_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
-; CHECK-NEXT: add.s32 %r9, %r4, %r8;
-; CHECK-NEXT: add.s32 %r10, %r2, %r6;
+; CHECK-NEXT: add.s32 %r9, %r7, %r8;
+; CHECK-NEXT: add.s32 %r10, %r5, %r6;
; CHECK-NEXT: add.s32 %r11, %r10, %r9;
-; CHECK-NEXT: add.s32 %r12, %r3, %r7;
-; CHECK-NEXT: add.s32 %r13, %r1, %r5;
+; CHECK-NEXT: add.s32 %r12, %r3, %r4;
+; CHECK-NEXT: add.s32 %r13, %r1, %r2;
; CHECK-NEXT: add.s32 %r14, %r13, %r12;
; CHECK-NEXT: add.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -1411,23 +1117,8 @@ define i32 @reduce_add_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_add_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: add.s32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: add.s32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: add.s32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: add.s32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: add.s32 %r12, %r11, %r4;
-; CHECK-O0-NEXT: add.s32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_add_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1436,18 +1127,19 @@ define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
-; CHECK-NEXT: add.s32 %r8, %r3, %r7;
-; CHECK-NEXT: add.s32 %r9, %r1, %r5;
-; CHECK-NEXT: add.s32 %r10, %r9, %r8;
-; CHECK-NEXT: add.s32 %r11, %r2, %r6;
-; CHECK-NEXT: add.s32 %r12, %r11, %r4;
-; CHECK-NEXT: add.s32 %r13, %r10, %r12;
+; CHECK-NEXT: add.s32 %r8, %r5, %r6;
+; CHECK-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-NEXT: add.s32 %r10, %r3, %r4;
+; CHECK-NEXT: add.s32 %r11, %r1, %r2;
+; CHECK-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-NEXT: add.s32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.add(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_mul_i16(<8 x i16> %in) {
; CHECK-LABEL: reduce_mul_i16(
; CHECK: {
@@ -1457,16 +1149,16 @@ define i16 @reduce_mul_i16(<8 x i16> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NEXT: mul.lo.s16 %rs5, %rs4, %rs2;
-; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs7;
-; CHECK-NEXT: mul.lo.s16 %rs11, %rs10, %rs5;
-; CHECK-NEXT: mul.lo.s16 %rs12, %rs3, %rs1;
-; CHECK-NEXT: mul.lo.s16 %rs13, %rs8, %rs6;
-; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs12;
-; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs11;
+; CHECK-NEXT: mul.lo.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: mul.lo.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: mul.lo.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: mul.lo.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: mul.lo.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
@@ -1474,26 +1166,8 @@ define i16 @reduce_mul_i16(<8 x i16> %in) {
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
-; CHECK-O0-LABEL: reduce_mul_i16_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_mul_i16_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
-; CHECK-O0-NEXT: mul.lo.s16 %rs8, %rs3, %rs7;
-; CHECK-O0-NEXT: mul.lo.s16 %rs9, %rs1, %rs5;
-; CHECK-O0-NEXT: mul.lo.s16 %rs10, %rs9, %rs8;
-; CHECK-O0-NEXT: mul.lo.s16 %rs11, %rs2, %rs6;
-; CHECK-O0-NEXT: mul.lo.s16 %rs12, %rs4, %rs11;
-; CHECK-O0-NEXT: mul.lo.s16 %rs13, %rs10, %rs12;
-; CHECK-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i16_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<14>;
@@ -1504,11 +1178,11 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
-; CHECK-NEXT: mul.lo.s16 %rs8, %rs3, %rs7;
-; CHECK-NEXT: mul.lo.s16 %rs9, %rs1, %rs5;
+; CHECK-NEXT: mul.lo.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: mul.lo.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs8;
-; CHECK-NEXT: mul.lo.s16 %rs11, %rs2, %rs6;
-; CHECK-NEXT: mul.lo.s16 %rs12, %rs4, %rs11;
+; CHECK-NEXT: mul.lo.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: mul.lo.s16 %rs12, %rs7, %rs11;
; CHECK-NEXT: mul.lo.s16 %rs13, %rs10, %rs12;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
@@ -1517,23 +1191,8 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_mul_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_mul_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
-; CHECK-O0-NEXT: mul.lo.s32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: mul.lo.s32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: mul.lo.s32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: mul.lo.s32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: mul.lo.s32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: mul.lo.s32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: mul.lo.s32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1541,11 +1200,11 @@ define i32 @reduce_mul_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
-; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8;
-; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6;
+; CHECK-NEXT: mul.lo.s32 %r9, %r7, %r8;
+; CHECK-NEXT: mul.lo.s32 %r10, %r5, %r6;
; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9;
-; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7;
-; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5;
+; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r4;
+; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r2;
; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12;
; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -1554,23 +1213,8 @@ define i32 @reduce_mul_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_mul_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: mul.lo.s32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: mul.lo.s32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: mul.lo.s32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: mul.lo.s32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: mul.lo.s32 %r12, %r4, %r11;
-; CHECK-O0-NEXT: mul.lo.s32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_mul_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1579,169 +1223,72 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
-; CHECK-NEXT: mul.lo.s32 %r8, %r3, %r7;
-; CHECK-NEXT: mul.lo.s32 %r9, %r1, %r5;
-; CHECK-NEXT: mul.lo.s32 %r10, %r9, %r8;
-; CHECK-NEXT: mul.lo.s32 %r11, %r2, %r6;
-; CHECK-NEXT: mul.lo.s32 %r12, %r4, %r11;
-; CHECK-NEXT: mul.lo.s32 %r13, %r10, %r12;
+; CHECK-NEXT: mul.lo.s32 %r8, %r5, %r6;
+; CHECK-NEXT: mul.lo.s32 %r9, %r8, %r7;
+; CHECK-NEXT: mul.lo.s32 %r10, %r3, %r4;
+; CHECK-NEXT: mul.lo.s32 %r11, %r1, %r2;
+; CHECK-NEXT: mul.lo.s32 %r12, %r11, %r10;
+; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.mul(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_umax_i16(<8 x i16> %in) {
-;
-; CHECK-SM80-LABEL: reduce_umax_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
-; CHECK-SM80-NEXT: .reg .b32 %r<6>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
-; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: max.u16 %rs5, %rs4, %rs2;
-; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs7;
-; CHECK-SM80-NEXT: max.u16 %rs11, %rs10, %rs5;
-; CHECK-SM80-NEXT: max.u16 %rs12, %rs3, %rs1;
-; CHECK-SM80-NEXT: max.u16 %rs13, %rs8, %rs6;
-; CHECK-SM80-NEXT: max.u16 %rs14, %rs13, %rs12;
-; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs11;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_umax_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
-; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_umax_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.u16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.u16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.u16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.u16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.u16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.u16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
-;
-;
-; CHECK-SM80-O0-LABEL: reduce_umax_i16_nonpow2(
-; CHECK-SM80-O0: {
-; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-O0-EMPTY:
-; CHECK-SM80-O0-NEXT: // %bb.0:
-; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
-; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
-; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
-; CHECK-SM80-O0-NEXT: max.u16 %rs8, %rs3, %rs7;
-; CHECK-SM80-O0-NEXT: max.u16 %rs9, %rs1, %rs5;
-; CHECK-SM80-O0-NEXT: max.u16 %rs10, %rs9, %rs8;
-; CHECK-SM80-O0-NEXT: max.u16 %rs11, %rs2, %rs6;
-; CHECK-SM80-O0-NEXT: max.u16 %rs12, %rs4, %rs11;
-; CHECK-SM80-O0-NEXT: max.u16 %rs13, %rs10, %rs12;
-; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-O0-NEXT: ret;
-;
-; CHECK-SM100-O0-LABEL: reduce_umax_i16_nonpow2(
-; CHECK-SM100-O0: {
-; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-O0-EMPTY:
-; CHECK-SM100-O0-NEXT: // %bb.0:
-; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
-; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
-; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 0;
-; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-O0-NEXT: max.u16x2 %r5, %r3, %r4;
-; CHECK-SM100-O0-NEXT: max.u16x2 %r6, %r2, %r1;
-; CHECK-SM100-O0-NEXT: max.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-O0-NEXT: max.u16 %rs11, %rs9, %rs10;
-; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-O0-NEXT: ret;
-; CHECK-SM80-LABEL: reduce_umax_i16_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
-; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
-; CHECK-SM80-NEXT: max.u16 %rs8, %rs3, %rs7;
-; CHECK-SM80-NEXT: max.u16 %rs9, %rs1, %rs5;
-; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs8;
-; CHECK-SM80-NEXT: max.u16 %rs11, %rs2, %rs6;
-; CHECK-SM80-NEXT: max.u16 %rs12, %rs4, %rs11;
-; CHECK-SM80-NEXT: max.u16 %rs13, %rs10, %rs12;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_umax_i16_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
-; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
-; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
-; CHECK-SM100-NEXT: mov.b16 %rs8, 0;
-; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-NEXT: max.u16x2 %r5, %r3, %r4;
-; CHECK-SM100-NEXT: max.u16x2 %r6, %r2, %r1;
-; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-NEXT: max.u16 %rs11, %rs9, %rs10;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_umax_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-NEXT: max.u16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.u16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.u16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.u16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.u16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: max.u16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umax(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_umax_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_umax_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
-; CHECK-O0-NEXT: max.u32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: max.u32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: max.u32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.u32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: max.u32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: max.u32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.u32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umax_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1749,11 +1296,11 @@ define i32 @reduce_umax_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
-; CHECK-NEXT: max.u32 %r9, %r4, %r8;
-; CHECK-NEXT: max.u32 %r10, %r2, %r6;
+; CHECK-NEXT: max.u32 %r9, %r7, %r8;
+; CHECK-NEXT: max.u32 %r10, %r5, %r6;
; CHECK-NEXT: max.u32 %r11, %r10, %r9;
-; CHECK-NEXT: max.u32 %r12, %r3, %r7;
-; CHECK-NEXT: max.u32 %r13, %r1, %r5;
+; CHECK-NEXT: max.u32 %r12, %r3, %r4;
+; CHECK-NEXT: max.u32 %r13, %r1, %r2;
; CHECK-NEXT: max.u32 %r14, %r13, %r12;
; CHECK-NEXT: max.u32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -1762,23 +1309,8 @@ define i32 @reduce_umax_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_umax_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: max.u32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: max.u32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: max.u32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: max.u32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: max.u32 %r12, %r4, %r11;
-; CHECK-O0-NEXT: max.u32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umax_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1787,169 +1319,72 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
-; CHECK-NEXT: max.u32 %r8, %r3, %r7;
-; CHECK-NEXT: max.u32 %r9, %r1, %r5;
-; CHECK-NEXT: max.u32 %r10, %r9, %r8;
-; CHECK-NEXT: max.u32 %r11, %r2, %r6;
-; CHECK-NEXT: max.u32 %r12, %r4, %r11;
-; CHECK-NEXT: max.u32 %r13, %r10, %r12;
+; CHECK-NEXT: max.u32 %r8, %r5, %r6;
+; CHECK-NEXT: max.u32 %r9, %r8, %r7;
+; CHECK-NEXT: max.u32 %r10, %r3, %r4;
+; CHECK-NEXT: max.u32 %r11, %r1, %r2;
+; CHECK-NEXT: max.u32 %r12, %r11, %r10;
+; CHECK-NEXT: max.u32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umax(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_umin_i16(<8 x i16> %in) {
-;
-; CHECK-SM80-LABEL: reduce_umin_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
-; CHECK-SM80-NEXT: .reg .b32 %r<6>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
-; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: min.u16 %rs5, %rs4, %rs2;
-; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs7;
-; CHECK-SM80-NEXT: min.u16 %rs11, %rs10, %rs5;
-; CHECK-SM80-NEXT: min.u16 %rs12, %rs3, %rs1;
-; CHECK-SM80-NEXT: min.u16 %rs13, %rs8, %rs6;
-; CHECK-SM80-NEXT: min.u16 %rs14, %rs13, %rs12;
-; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs11;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_umin_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
-; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_umin_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.u16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.u16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.u16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.u16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.u16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.u16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.u16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
-;
-;
-; CHECK-SM80-O0-LABEL: reduce_umin_i16_nonpow2(
-; CHECK-SM80-O0: {
-; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-O0-EMPTY:
-; CHECK-SM80-O0-NEXT: // %bb.0:
-; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
-; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
-; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
-; CHECK-SM80-O0-NEXT: min.u16 %rs8, %rs3, %rs7;
-; CHECK-SM80-O0-NEXT: min.u16 %rs9, %rs1, %rs5;
-; CHECK-SM80-O0-NEXT: min.u16 %rs10, %rs9, %rs8;
-; CHECK-SM80-O0-NEXT: min.u16 %rs11, %rs2, %rs6;
-; CHECK-SM80-O0-NEXT: min.u16 %rs12, %rs4, %rs11;
-; CHECK-SM80-O0-NEXT: min.u16 %rs13, %rs10, %rs12;
-; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-O0-NEXT: ret;
-;
-; CHECK-SM100-O0-LABEL: reduce_umin_i16_nonpow2(
-; CHECK-SM100-O0: {
-; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-O0-EMPTY:
-; CHECK-SM100-O0-NEXT: // %bb.0:
-; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
-; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
-; CHECK-SM100-O0-NEXT: mov.b16 %rs8, -1;
-; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-O0-NEXT: min.u16x2 %r5, %r3, %r4;
-; CHECK-SM100-O0-NEXT: min.u16x2 %r6, %r2, %r1;
-; CHECK-SM100-O0-NEXT: min.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-O0-NEXT: min.u16 %rs11, %rs9, %rs10;
-; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-O0-NEXT: ret;
-; CHECK-SM80-LABEL: reduce_umin_i16_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
-; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
-; CHECK-SM80-NEXT: min.u16 %rs8, %rs3, %rs7;
-; CHECK-SM80-NEXT: min.u16 %rs9, %rs1, %rs5;
-; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs8;
-; CHECK-SM80-NEXT: min.u16 %rs11, %rs2, %rs6;
-; CHECK-SM80-NEXT: min.u16 %rs12, %rs4, %rs11;
-; CHECK-SM80-NEXT: min.u16 %rs13, %rs10, %rs12;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_umin_i16_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
-; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
-; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
-; CHECK-SM100-NEXT: mov.b16 %rs8, -1;
-; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-NEXT: min.u16x2 %r5, %r3, %r4;
-; CHECK-SM100-NEXT: min.u16x2 %r6, %r2, %r1;
-; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-NEXT: min.u16 %rs11, %rs9, %rs10;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_umin_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-NEXT: min.u16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.u16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.u16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.u16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.u16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: min.u16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.umin(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_umin_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_umin_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
-; CHECK-O0-NEXT: min.u32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: min.u32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: min.u32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.u32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: min.u32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: min.u32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.u32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umin_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -1957,11 +1392,11 @@ define i32 @reduce_umin_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
-; CHECK-NEXT: min.u32 %r9, %r4, %r8;
-; CHECK-NEXT: min.u32 %r10, %r2, %r6;
+; CHECK-NEXT: min.u32 %r9, %r7, %r8;
+; CHECK-NEXT: min.u32 %r10, %r5, %r6;
; CHECK-NEXT: min.u32 %r11, %r10, %r9;
-; CHECK-NEXT: min.u32 %r12, %r3, %r7;
-; CHECK-NEXT: min.u32 %r13, %r1, %r5;
+; CHECK-NEXT: min.u32 %r12, %r3, %r4;
+; CHECK-NEXT: min.u32 %r13, %r1, %r2;
; CHECK-NEXT: min.u32 %r14, %r13, %r12;
; CHECK-NEXT: min.u32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -1970,23 +1405,8 @@ define i32 @reduce_umin_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_umin_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: min.u32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: min.u32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: min.u32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: min.u32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: min.u32 %r12, %r4, %r11;
-; CHECK-O0-NEXT: min.u32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_umin_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -1995,169 +1415,72 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
-; CHECK-NEXT: min.u32 %r8, %r3, %r7;
-; CHECK-NEXT: min.u32 %r9, %r1, %r5;
-; CHECK-NEXT: min.u32 %r10, %r9, %r8;
-; CHECK-NEXT: min.u32 %r11, %r2, %r6;
-; CHECK-NEXT: min.u32 %r12, %r4, %r11;
-; CHECK-NEXT: min.u32 %r13, %r10, %r12;
+; CHECK-NEXT: min.u32 %r8, %r5, %r6;
+; CHECK-NEXT: min.u32 %r9, %r8, %r7;
+; CHECK-NEXT: min.u32 %r10, %r3, %r4;
+; CHECK-NEXT: min.u32 %r11, %r1, %r2;
+; CHECK-NEXT: min.u32 %r12, %r11, %r10;
+; CHECK-NEXT: min.u32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.umin(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_smax_i16(<8 x i16> %in) {
-;
-; CHECK-SM80-LABEL: reduce_smax_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
-; CHECK-SM80-NEXT: .reg .b32 %r<6>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
-; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: max.s16 %rs5, %rs4, %rs2;
-; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs7;
-; CHECK-SM80-NEXT: max.s16 %rs11, %rs10, %rs5;
-; CHECK-SM80-NEXT: max.s16 %rs12, %rs3, %rs1;
-; CHECK-SM80-NEXT: max.s16 %rs13, %rs8, %rs6;
-; CHECK-SM80-NEXT: max.s16 %rs14, %rs13, %rs12;
-; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs11;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_smax_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
-; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_smax_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
-;
-;
-; CHECK-SM80-O0-LABEL: reduce_smax_i16_nonpow2(
-; CHECK-SM80-O0: {
-; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-O0-EMPTY:
-; CHECK-SM80-O0-NEXT: // %bb.0:
-; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
-; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
-; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
-; CHECK-SM80-O0-NEXT: max.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-O0-NEXT: max.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-O0-NEXT: max.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-O0-NEXT: max.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-O0-NEXT: max.s16 %rs12, %rs4, %rs11;
-; CHECK-SM80-O0-NEXT: max.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-O0-NEXT: ret;
-;
-; CHECK-SM100-O0-LABEL: reduce_smax_i16_nonpow2(
-; CHECK-SM100-O0: {
-; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-O0-EMPTY:
-; CHECK-SM100-O0-NEXT: // %bb.0:
-; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
-; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
-; CHECK-SM100-O0-NEXT: mov.b16 %rs8, -32768;
-; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-O0-NEXT: max.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-O0-NEXT: max.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-O0-NEXT: max.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-O0-NEXT: max.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-O0-NEXT: ret;
-; CHECK-SM80-LABEL: reduce_smax_i16_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
-; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
-; CHECK-SM80-NEXT: max.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-NEXT: max.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-NEXT: max.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-NEXT: max.s16 %rs12, %rs4, %rs11;
-; CHECK-SM80-NEXT: max.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_smax_i16_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
-; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
-; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
-; CHECK-SM100-NEXT: mov.b16 %rs8, -32768;
-; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-NEXT: max.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-NEXT: max.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-NEXT: max.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_smax_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-NEXT: max.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: max.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: max.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: max.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: max.s16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: max.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smax(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_smax_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_smax_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
-; CHECK-O0-NEXT: max.s32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: max.s32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: max.s32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: max.s32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: max.s32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: max.s32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: max.s32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smax_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -2165,11 +1488,11 @@ define i32 @reduce_smax_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
-; CHECK-NEXT: max.s32 %r9, %r4, %r8;
-; CHECK-NEXT: max.s32 %r10, %r2, %r6;
+; CHECK-NEXT: max.s32 %r9, %r7, %r8;
+; CHECK-NEXT: max.s32 %r10, %r5, %r6;
; CHECK-NEXT: max.s32 %r11, %r10, %r9;
-; CHECK-NEXT: max.s32 %r12, %r3, %r7;
-; CHECK-NEXT: max.s32 %r13, %r1, %r5;
+; CHECK-NEXT: max.s32 %r12, %r3, %r4;
+; CHECK-NEXT: max.s32 %r13, %r1, %r2;
; CHECK-NEXT: max.s32 %r14, %r13, %r12;
; CHECK-NEXT: max.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -2178,23 +1501,8 @@ define i32 @reduce_smax_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_smax_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: max.s32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: max.s32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: max.s32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: max.s32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: max.s32 %r12, %r4, %r11;
-; CHECK-O0-NEXT: max.s32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smax_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -2203,169 +1511,72 @@ define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
-; CHECK-NEXT: max.s32 %r8, %r3, %r7;
-; CHECK-NEXT: max.s32 %r9, %r1, %r5;
-; CHECK-NEXT: max.s32 %r10, %r9, %r8;
-; CHECK-NEXT: max.s32 %r11, %r2, %r6;
-; CHECK-NEXT: max.s32 %r12, %r4, %r11;
-; CHECK-NEXT: max.s32 %r13, %r10, %r12;
+; CHECK-NEXT: max.s32 %r8, %r5, %r6;
+; CHECK-NEXT: max.s32 %r9, %r8, %r7;
+; CHECK-NEXT: max.s32 %r10, %r3, %r4;
+; CHECK-NEXT: max.s32 %r11, %r1, %r2;
+; CHECK-NEXT: max.s32 %r12, %r11, %r10;
+; CHECK-NEXT: max.s32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smax(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_smin_i16(<8 x i16> %in) {
-;
-; CHECK-SM80-LABEL: reduce_smin_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<16>;
-; CHECK-SM80-NEXT: .reg .b32 %r<6>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
-; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM80-NEXT: min.s16 %rs5, %rs4, %rs2;
-; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3;
-; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1;
-; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs7;
-; CHECK-SM80-NEXT: min.s16 %rs11, %rs10, %rs5;
-; CHECK-SM80-NEXT: min.s16 %rs12, %rs3, %rs1;
-; CHECK-SM80-NEXT: min.s16 %rs13, %rs8, %rs6;
-; CHECK-SM80-NEXT: min.s16 %rs14, %rs13, %rs12;
-; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs11;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_smin_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
-; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
-; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_smin_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.s16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.s16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.s16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.s16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.s16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.s16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.s16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
-;
-;
-; CHECK-SM80-O0-LABEL: reduce_smin_i16_nonpow2(
-; CHECK-SM80-O0: {
-; CHECK-SM80-O0-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-O0-EMPTY:
-; CHECK-SM80-O0-NEXT: // %bb.0:
-; CHECK-SM80-O0-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
-; CHECK-SM80-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-O0-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
-; CHECK-SM80-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
-; CHECK-SM80-O0-NEXT: min.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-O0-NEXT: min.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-O0-NEXT: min.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-O0-NEXT: min.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-O0-NEXT: min.s16 %rs12, %rs4, %rs11;
-; CHECK-SM80-O0-NEXT: min.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-O0-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-O0-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-O0-NEXT: ret;
-;
-; CHECK-SM100-O0-LABEL: reduce_smin_i16_nonpow2(
-; CHECK-SM100-O0: {
-; CHECK-SM100-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-O0-EMPTY:
-; CHECK-SM100-O0-NEXT: // %bb.0:
-; CHECK-SM100-O0-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
-; CHECK-SM100-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-O0-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
-; CHECK-SM100-O0-NEXT: mov.b16 %rs8, 32767;
-; CHECK-SM100-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-O0-NEXT: min.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-O0-NEXT: min.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-O0-NEXT: min.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-O0-NEXT: min.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-O0-NEXT: ret;
-; CHECK-SM80-LABEL: reduce_smin_i16_nonpow2(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<14>;
-; CHECK-SM80-NEXT: .reg .b32 %r<3>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
-; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
-; CHECK-SM80-NEXT: min.s16 %rs8, %rs3, %rs7;
-; CHECK-SM80-NEXT: min.s16 %rs9, %rs1, %rs5;
-; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs8;
-; CHECK-SM80-NEXT: min.s16 %rs11, %rs2, %rs6;
-; CHECK-SM80-NEXT: min.s16 %rs12, %rs4, %rs11;
-; CHECK-SM80-NEXT: min.s16 %rs13, %rs10, %rs12;
-; CHECK-SM80-NEXT: cvt.u32.u16 %r2, %rs13;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_smin_i16_nonpow2(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<12>;
-; CHECK-SM100-NEXT: .reg .b32 %r<9>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
-; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
-; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
-; CHECK-SM100-NEXT: mov.b16 %rs8, 32767;
-; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-SM100-NEXT: min.s16x2 %r5, %r3, %r4;
-; CHECK-SM100-NEXT: min.s16x2 %r6, %r2, %r1;
-; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-SM100-NEXT: min.s16 %rs11, %rs9, %rs10;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_smin_i16_nonpow2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-NEXT: min.s16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: min.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: min.s16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: min.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: min.s16 %rs12, %rs7, %rs11;
+; CHECK-NEXT: min.s16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.smin(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_smin_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_smin_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
-; CHECK-O0-NEXT: min.s32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: min.s32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: min.s32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: min.s32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: min.s32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: min.s32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: min.s32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smin_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -2373,11 +1584,11 @@ define i32 @reduce_smin_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
-; CHECK-NEXT: min.s32 %r9, %r4, %r8;
-; CHECK-NEXT: min.s32 %r10, %r2, %r6;
+; CHECK-NEXT: min.s32 %r9, %r7, %r8;
+; CHECK-NEXT: min.s32 %r10, %r5, %r6;
; CHECK-NEXT: min.s32 %r11, %r10, %r9;
-; CHECK-NEXT: min.s32 %r12, %r3, %r7;
-; CHECK-NEXT: min.s32 %r13, %r1, %r5;
+; CHECK-NEXT: min.s32 %r12, %r3, %r4;
+; CHECK-NEXT: min.s32 %r13, %r1, %r2;
; CHECK-NEXT: min.s32 %r14, %r13, %r12;
; CHECK-NEXT: min.s32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -2386,23 +1597,8 @@ define i32 @reduce_smin_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_smin_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: min.s32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: min.s32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: min.s32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: min.s32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: min.s32 %r12, %r4, %r11;
-; CHECK-O0-NEXT: min.s32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_smin_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -2411,104 +1607,72 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
-; CHECK-NEXT: min.s32 %r8, %r3, %r7;
-; CHECK-NEXT: min.s32 %r9, %r1, %r5;
-; CHECK-NEXT: min.s32 %r10, %r9, %r8;
-; CHECK-NEXT: min.s32 %r11, %r2, %r6;
-; CHECK-NEXT: min.s32 %r12, %r4, %r11;
-; CHECK-NEXT: min.s32 %r13, %r10, %r12;
+; CHECK-NEXT: min.s32 %r8, %r5, %r6;
+; CHECK-NEXT: min.s32 %r9, %r8, %r7;
+; CHECK-NEXT: min.s32 %r10, %r3, %r4;
+; CHECK-NEXT: min.s32 %r11, %r1, %r2;
+; CHECK-NEXT: min.s32 %r12, %r11, %r10;
+; CHECK-NEXT: min.s32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.smin(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_and_i16(<8 x i16> %in) {
; CHECK-LABEL: reduce_and_i16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
-; CHECK-NEXT: and.b32 %r5, %r2, %r4;
-; CHECK-NEXT: and.b32 %r6, %r1, %r3;
-; CHECK-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: and.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: and.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: and.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: and.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: and.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.and(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
-; CHECK-O0-LABEL: reduce_and_i16_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_and_i16_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
-; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
-; CHECK-O0-NEXT: mov.b16 %rs8, -1;
-; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-O0-NEXT: and.b32 %r5, %r3, %r4;
-; CHECK-O0-NEXT: and.b32 %r6, %r2, %r1;
-; CHECK-O0-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-O0-NEXT: and.b16 %rs11, %rs9, %rs10;
-; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i16_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_and_i16_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, -1;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r5, %r3, %r4;
-; CHECK-NEXT: and.b32 %r6, %r2, %r1;
-; CHECK-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: and.b16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
+; CHECK-NEXT: and.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: and.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: and.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: and.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: and.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: and.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.and(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_and_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_and_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
-; CHECK-O0-NEXT: and.b32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: and.b32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: and.b32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: and.b32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: and.b32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: and.b32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: and.b32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -2516,11 +1680,11 @@ define i32 @reduce_and_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
-; CHECK-NEXT: and.b32 %r9, %r4, %r8;
-; CHECK-NEXT: and.b32 %r10, %r2, %r6;
+; CHECK-NEXT: and.b32 %r9, %r7, %r8;
+; CHECK-NEXT: and.b32 %r10, %r5, %r6;
; CHECK-NEXT: and.b32 %r11, %r10, %r9;
-; CHECK-NEXT: and.b32 %r12, %r3, %r7;
-; CHECK-NEXT: and.b32 %r13, %r1, %r5;
+; CHECK-NEXT: and.b32 %r12, %r3, %r4;
+; CHECK-NEXT: and.b32 %r13, %r1, %r2;
; CHECK-NEXT: and.b32 %r14, %r13, %r12;
; CHECK-NEXT: and.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -2529,23 +1693,8 @@ define i32 @reduce_and_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_and_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: and.b32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: and.b32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: and.b32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: and.b32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: and.b32 %r12, %r11, %r4;
-; CHECK-O0-NEXT: and.b32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_and_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -2554,104 +1703,72 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
-; CHECK-NEXT: and.b32 %r8, %r3, %r7;
-; CHECK-NEXT: and.b32 %r9, %r1, %r5;
-; CHECK-NEXT: and.b32 %r10, %r9, %r8;
-; CHECK-NEXT: and.b32 %r11, %r2, %r6;
-; CHECK-NEXT: and.b32 %r12, %r11, %r4;
-; CHECK-NEXT: and.b32 %r13, %r10, %r12;
+; CHECK-NEXT: and.b32 %r8, %r5, %r6;
+; CHECK-NEXT: and.b32 %r9, %r8, %r7;
+; CHECK-NEXT: and.b32 %r10, %r3, %r4;
+; CHECK-NEXT: and.b32 %r11, %r1, %r2;
+; CHECK-NEXT: and.b32 %r12, %r11, %r10;
+; CHECK-NEXT: and.b32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.and(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_or_i16(<8 x i16> %in) {
; CHECK-LABEL: reduce_or_i16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
-; CHECK-NEXT: or.b32 %r5, %r2, %r4;
-; CHECK-NEXT: or.b32 %r6, %r1, %r3;
-; CHECK-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: or.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: or.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: or.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: or.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: or.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: or.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.or(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
-; CHECK-O0-LABEL: reduce_or_i16_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_or_i16_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
-; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
-; CHECK-O0-NEXT: mov.b16 %rs8, 0;
-; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-O0-NEXT: or.b32 %r5, %r3, %r4;
-; CHECK-O0-NEXT: or.b32 %r6, %r2, %r1;
-; CHECK-O0-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-O0-NEXT: or.b16 %rs11, %rs9, %rs10;
-; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i16_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_or_i16_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: or.b32 %r5, %r3, %r4;
-; CHECK-NEXT: or.b32 %r6, %r2, %r1;
-; CHECK-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: or.b16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
+; CHECK-NEXT: or.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: or.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: or.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: or.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: or.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.or(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_or_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_or_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
-; CHECK-O0-NEXT: or.b32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: or.b32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: or.b32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: or.b32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: or.b32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: or.b32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: or.b32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -2659,11 +1776,11 @@ define i32 @reduce_or_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
-; CHECK-NEXT: or.b32 %r9, %r4, %r8;
-; CHECK-NEXT: or.b32 %r10, %r2, %r6;
+; CHECK-NEXT: or.b32 %r9, %r7, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r6;
; CHECK-NEXT: or.b32 %r11, %r10, %r9;
-; CHECK-NEXT: or.b32 %r12, %r3, %r7;
-; CHECK-NEXT: or.b32 %r13, %r1, %r5;
+; CHECK-NEXT: or.b32 %r12, %r3, %r4;
+; CHECK-NEXT: or.b32 %r13, %r1, %r2;
; CHECK-NEXT: or.b32 %r14, %r13, %r12;
; CHECK-NEXT: or.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -2672,23 +1789,8 @@ define i32 @reduce_or_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_or_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: or.b32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: or.b32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: or.b32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: or.b32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: or.b32 %r12, %r11, %r4;
-; CHECK-O0-NEXT: or.b32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_or_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -2697,104 +1799,72 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
-; CHECK-NEXT: or.b32 %r8, %r3, %r7;
-; CHECK-NEXT: or.b32 %r9, %r1, %r5;
-; CHECK-NEXT: or.b32 %r10, %r9, %r8;
-; CHECK-NEXT: or.b32 %r11, %r2, %r6;
-; CHECK-NEXT: or.b32 %r12, %r11, %r4;
-; CHECK-NEXT: or.b32 %r13, %r10, %r12;
+; CHECK-NEXT: or.b32 %r8, %r5, %r6;
+; CHECK-NEXT: or.b32 %r9, %r8, %r7;
+; CHECK-NEXT: or.b32 %r10, %r3, %r4;
+; CHECK-NEXT: or.b32 %r11, %r1, %r2;
+; CHECK-NEXT: or.b32 %r12, %r11, %r10;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.or(<7 x i32> %in)
ret i32 %res
}
+; Check tree reduction.
define i16 @reduce_xor_i16(<8 x i16> %in) {
; CHECK-LABEL: reduce_xor_i16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
-; CHECK-NEXT: xor.b32 %r5, %r2, %r4;
-; CHECK-NEXT: xor.b32 %r6, %r1, %r3;
-; CHECK-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: xor.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: xor.b16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: xor.b16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: xor.b16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: xor.b16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: xor.b16 %rs15, %rs14, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs15;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in)
ret i16 %res
}
+; Check tree reduction with non-power of 2 size.
define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
-; CHECK-O0-LABEL: reduce_xor_i16_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b16 %rs<12>;
-; CHECK-O0-NEXT: .reg .b32 %r<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
-; CHECK-O0-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
-; CHECK-O0-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-O0-NEXT: mov.b32 %r3, {%rs3, %rs4};
-; CHECK-O0-NEXT: ld.param.b16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
-; CHECK-O0-NEXT: mov.b16 %rs8, 0;
-; CHECK-O0-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-O0-NEXT: xor.b32 %r5, %r3, %r4;
-; CHECK-O0-NEXT: xor.b32 %r6, %r2, %r1;
-; CHECK-O0-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-O0-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-O0-NEXT: xor.b16 %rs11, %rs9, %rs10;
-; CHECK-O0-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r8;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i16_nonpow2(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b16 %rs<14>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4};
; CHECK-NEXT: ld.param.b16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
-; CHECK-NEXT: mov.b16 %rs8, 0;
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: xor.b32 %r5, %r3, %r4;
-; CHECK-NEXT: xor.b32 %r6, %r2, %r1;
-; CHECK-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7;
-; CHECK-NEXT: xor.b16 %rs11, %rs9, %rs10;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
+; CHECK-NEXT: xor.b16 %rs8, %rs3, %rs4;
+; CHECK-NEXT: xor.b16 %rs9, %rs1, %rs2;
+; CHECK-NEXT: xor.b16 %rs10, %rs9, %rs8;
+; CHECK-NEXT: xor.b16 %rs11, %rs5, %rs6;
+; CHECK-NEXT: xor.b16 %rs12, %rs11, %rs7;
+; CHECK-NEXT: xor.b16 %rs13, %rs10, %rs12;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%res = call i16 @llvm.vector.reduce.xor(<7 x i16> %in)
ret i16 %res
}
+; Check tree reduction.
define i32 @reduce_xor_i32(<8 x i32> %in) {
-; CHECK-O0-LABEL: reduce_xor_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<16>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
-; CHECK-O0-NEXT: xor.b32 %r9, %r4, %r8;
-; CHECK-O0-NEXT: xor.b32 %r10, %r2, %r6;
-; CHECK-O0-NEXT: xor.b32 %r11, %r10, %r9;
-; CHECK-O0-NEXT: xor.b32 %r12, %r3, %r7;
-; CHECK-O0-NEXT: xor.b32 %r13, %r1, %r5;
-; CHECK-O0-NEXT: xor.b32 %r14, %r13, %r12;
-; CHECK-O0-NEXT: xor.b32 %r15, %r14, %r11;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r15;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<16>;
@@ -2802,11 +1872,11 @@ define i32 @reduce_xor_i32(<8 x i32> %in) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
-; CHECK-NEXT: xor.b32 %r9, %r4, %r8;
-; CHECK-NEXT: xor.b32 %r10, %r2, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r7, %r8;
+; CHECK-NEXT: xor.b32 %r10, %r5, %r6;
; CHECK-NEXT: xor.b32 %r11, %r10, %r9;
-; CHECK-NEXT: xor.b32 %r12, %r3, %r7;
-; CHECK-NEXT: xor.b32 %r13, %r1, %r5;
+; CHECK-NEXT: xor.b32 %r12, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r13, %r1, %r2;
; CHECK-NEXT: xor.b32 %r14, %r13, %r12;
; CHECK-NEXT: xor.b32 %r15, %r14, %r11;
; CHECK-NEXT: st.param.b32 [func_retval0], %r15;
@@ -2815,23 +1885,8 @@ define i32 @reduce_xor_i32(<8 x i32> %in) {
ret i32 %res
}
+; Check tree reduction with non-power of 2 size.
define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) {
-; CHECK-O0-LABEL: reduce_xor_i32_nonpow2(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<14>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
-; CHECK-O0-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
-; CHECK-O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
-; CHECK-O0-NEXT: xor.b32 %r8, %r3, %r7;
-; CHECK-O0-NEXT: xor.b32 %r9, %r1, %r5;
-; CHECK-O0-NEXT: xor.b32 %r10, %r9, %r8;
-; CHECK-O0-NEXT: xor.b32 %r11, %r2, %r6;
-; CHECK-O0-NEXT: xor.b32 %r12, %r11, %r4;
-; CHECK-O0-NEXT: xor.b32 %r13, %r10, %r12;
-; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r13;
-; CHECK-O0-NEXT: ret;
; CHECK-LABEL: reduce_xor_i32_nonpow2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<14>;
@@ -2840,12 +1895,12 @@ define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) {
; CHECK-NEXT: ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
-; CHECK-NEXT: xor.b32 %r8, %r3, %r7;
-; CHECK-NEXT: xor.b32 %r9, %r1, %r5;
-; CHECK-NEXT: xor.b32 %r10, %r9, %r8;
-; CHECK-NEXT: xor.b32 %r11, %r2, %r6;
-; CHECK-NEXT: xor.b32 %r12, %r11, %r4;
-; CHECK-NEXT: xor.b32 %r13, %r10, %r12;
+; CHECK-NEXT: xor.b32 %r8, %r5, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r8, %r7;
+; CHECK-NEXT: xor.b32 %r10, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r11, %r1, %r2;
+; CHECK-NEXT: xor.b32 %r12, %r11, %r10;
+; CHECK-NEXT: xor.b32 %r13, %r12, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
%res = call i32 @llvm.vector.reduce.xor(<7 x i32> %in)
>From 1a01686f7d24d8548f061af55d96464655d7389c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 17 Apr 2025 21:30:20 -0700
Subject: [PATCH 3/4] [NVPTX] support VECREDUCE_SEQ ops and remove option
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 26 +++++++---
.../CodeGen/NVPTX/reduction-intrinsics.ll | 48 +++++++++----------
2 files changed, 44 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 45dd661613d81..f45b4cd7a2e22 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -876,6 +876,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
EltVT == MVT::f64) {
setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+ ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL,
ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
VT, Custom);
@@ -2270,12 +2271,19 @@ static SDValue BuildTreeReduction(
/// max3/min3 when the target supports them.
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
- if (DisableFOpTreeReduce)
- return SDValue();
-
SDLoc DL(Op);
const SDNodeFlags Flags = Op->getFlags();
- const SDValue &Vector = Op.getOperand(0);
+ SDValue Vector;
+ SDValue Accumulator;
+ if (Op->getOpcode() == ISD::VECREDUCE_SEQ_FADD ||
+ Op->getOpcode() == ISD::VECREDUCE_SEQ_FMUL) {
+ // special case with accumulator as first arg
+ Accumulator = Op.getOperand(0);
+ Vector = Op.getOperand(1);
+ } else {
+ // default case
+ Vector = Op.getOperand(0);
+ }
EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;
@@ -2287,10 +2295,12 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_SEQ_FADD:
ScalarOps = {{ISD::FADD, 2}};
IsReassociatable = false;
break;
case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_SEQ_FMUL:
ScalarOps = {{ISD::FMUL, 2}};
IsReassociatable = false;
break;
@@ -2369,11 +2379,13 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
}
// Lower to tree reduction.
- if (IsReassociatable || Flags.hasAllowReassociation())
+ if (IsReassociatable || Flags.hasAllowReassociation()) {
+ // we don't expect an accumulator for reassociatable vector reduction ops
+ assert(!Accumulator && "unexpected accumulator");
return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+ }
// Lower to sequential reduction.
- SDValue Accumulator;
for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
@@ -3234,6 +3246,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 72cd7e38bda9f..c8b1238a24752 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -23,19 +23,19 @@ define half @reduce_fadd_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
-; CHECK-NEXT: mov.b16 %rs9, 0x0000;
-; CHECK-NEXT: add.rn.f16 %rs10, %rs7, %rs9;
-; CHECK-NEXT: add.rn.f16 %rs11, %rs10, %rs8;
-; CHECK-NEXT: add.rn.f16 %rs12, %rs11, %rs5;
-; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs6;
-; CHECK-NEXT: add.rn.f16 %rs14, %rs13, %rs3;
-; CHECK-NEXT: add.rn.f16 %rs15, %rs14, %rs4;
-; CHECK-NEXT: add.rn.f16 %rs16, %rs15, %rs1;
-; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs2;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: mov.b16 %rs3, 0x0000;
+; CHECK-NEXT: add.rn.f16 %rs4, %rs1, %rs3;
+; CHECK-NEXT: add.rn.f16 %rs5, %rs4, %rs2;
+; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r2;
+; CHECK-NEXT: add.rn.f16 %rs8, %rs5, %rs6;
+; CHECK-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: mov.b32 {%rs10, %rs11}, %r3;
+; CHECK-NEXT: add.rn.f16 %rs12, %rs9, %rs10;
+; CHECK-NEXT: add.rn.f16 %rs13, %rs12, %rs11;
+; CHECK-NEXT: mov.b32 {%rs14, %rs15}, %r4;
+; CHECK-NEXT: add.rn.f16 %rs16, %rs13, %rs14;
+; CHECK-NEXT: add.rn.f16 %rs17, %rs16, %rs15;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs17;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in)
@@ -174,17 +174,17 @@ define half @reduce_fmul_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3;
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2;
-; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1;
-; CHECK-NEXT: mul.rn.f16 %rs9, %rs7, %rs8;
-; CHECK-NEXT: mul.rn.f16 %rs10, %rs9, %rs5;
-; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs6;
-; CHECK-NEXT: mul.rn.f16 %rs12, %rs11, %rs3;
-; CHECK-NEXT: mul.rn.f16 %rs13, %rs12, %rs4;
-; CHECK-NEXT: mul.rn.f16 %rs14, %rs13, %rs1;
-; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs2;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT: mul.rn.f16 %rs5, %rs3, %rs4;
+; CHECK-NEXT: mul.rn.f16 %rs6, %rs5, %rs1;
+; CHECK-NEXT: mul.rn.f16 %rs7, %rs6, %rs2;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
+; CHECK-NEXT: mul.rn.f16 %rs10, %rs7, %rs8;
+; CHECK-NEXT: mul.rn.f16 %rs11, %rs10, %rs9;
+; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
+; CHECK-NEXT: mul.rn.f16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: mul.rn.f16 %rs15, %rs14, %rs13;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in)
>From b13ed82ea904348fca9835643ca6d07396287c3e Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 22 Apr 2025 21:43:05 -0700
Subject: [PATCH 4/4] [NVPTX] expand associativity to fmax / fmin and add
comments
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 74 +++++---
.../CodeGen/NVPTX/reduction-intrinsics.ll | 168 +++++++++---------
2 files changed, 133 insertions(+), 109 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f45b4cd7a2e22..5087ca72e0d17 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2225,19 +2225,25 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
/// A generic routine for constructing a tree reduction on a vector operand.
-/// This method differs from iterative splitting in DAGTypeLegalizer by
-/// progressively grouping elements bottom-up.
+/// This method groups elements bottom-up, progressively building each level.
+/// This approach differs from top-down iterative splitting used in
+/// DAGTypeLegalizer and ExpandReductions.
+///
+/// Also, the flags on the original reduction operation will be propagated to
+/// each scalar operation.
static SDValue BuildTreeReduction(
const SmallVector<SDValue> &Elements, EVT EltTy,
ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
- // now build the computation graph in place at each level
+ // Build the reduction tree at each level, starting with all the elements.
SmallVector<SDValue> Level = Elements;
+
unsigned OpIdx = 0;
while (Level.size() > 1) {
+ // Try to reduce this level using the current operator.
const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
- // partially reduce all elements in level
+ // Build the next level by partially reducing all elements.
SmallVector<SDValue> ReducedLevel;
unsigned I = 0, E = Level.size();
for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
@@ -2248,18 +2254,23 @@ static SDValue BuildTreeReduction(
}
if (I < E) {
+ // We have leftover elements. Why?
+
if (ReducedLevel.empty()) {
- // The current operator requires more inputs than there are operands at
- // this level. Pick a smaller operator and retry.
+ // ...because this level is now so small that the current operator is
+ // too big for it. Pick a smaller operator and retry.
++OpIdx;
assert(OpIdx < Ops.size() && "no smaller operators for reduction");
continue;
}
- // Otherwise, we just have a remainder, which we push to the next level.
+ // ...because the operator's required number of inputs doesn't evenly
+ // divide this level's size. We push the remainder to the next level.
for (; I < E; ++I)
ReducedLevel.push_back(Level[I]);
}
+
+ // Process the next level.
Level = ReducedLevel;
}
@@ -2275,6 +2286,7 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
const SDNodeFlags Flags = Op->getFlags();
SDValue Vector;
SDValue Accumulator;
+
if (Op->getOpcode() == ISD::VECREDUCE_SEQ_FADD ||
Op->getOpcode() == ISD::VECREDUCE_SEQ_FMUL) {
// special case with accumulator as first arg
@@ -2284,6 +2296,7 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
// default case
Vector = Op.getOperand(0);
}
+
EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;
@@ -2291,78 +2304,86 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
// A list of SDNode opcodes with equivalent semantics, sorted descending by
// number of inputs they take.
SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
- bool IsReassociatable;
+
+ // Whether we can lower to scalar operations in an arbitrary order.
+ bool IsAssociative;
switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_SEQ_FADD:
ScalarOps = {{ISD::FADD, 2}};
- IsReassociatable = false;
+ IsAssociative = Op->getOpcode() == ISD::VECREDUCE_FADD;
break;
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_SEQ_FMUL:
ScalarOps = {{ISD::FMUL, 2}};
- IsReassociatable = false;
+ IsAssociative = Op->getOpcode() == ISD::VECREDUCE_FMUL;
break;
case ISD::VECREDUCE_FMAX:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
ScalarOps.push_back({ISD::FMAXNUM, 2});
- IsReassociatable = false;
+ // Definition of maxNum in IEEE 754 2008 is non-associative, but only
+ // because of how sNaNs are treated. However, NVIDIA GPUs don't support
+ // sNaNs.
+ IsAssociative = true;
break;
case ISD::VECREDUCE_FMIN:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
ScalarOps.push_back({ISD::FMINNUM, 2});
- IsReassociatable = false;
+ // Definition of minNum in IEEE 754 2008 is non-associative, but only
+ // because of how sNaNs are treated. However, NVIDIA GPUs don't support
+ // sNaNs.
+ IsAssociative = true;
break;
case ISD::VECREDUCE_FMAXIMUM:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
ScalarOps.push_back({ISD::FMAXIMUM, 2});
- IsReassociatable = false;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_FMINIMUM:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
ScalarOps.push_back({ISD::FMINIMUM, 2});
- IsReassociatable = false;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_ADD:
ScalarOps = {{ISD::ADD, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_MUL:
ScalarOps = {{ISD::MUL, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_UMAX:
ScalarOps = {{ISD::UMAX, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_UMIN:
ScalarOps = {{ISD::UMIN, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_SMAX:
ScalarOps = {{ISD::SMAX, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_SMIN:
ScalarOps = {{ISD::SMIN, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_AND:
ScalarOps = {{ISD::AND, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_OR:
ScalarOps = {{ISD::OR, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
case ISD::VECREDUCE_XOR:
ScalarOps = {{ISD::XOR, 2}};
- IsReassociatable = true;
+ IsAssociative = true;
break;
default:
llvm_unreachable("unhandled vecreduce operation");
@@ -2379,18 +2400,21 @@ SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
}
// Lower to tree reduction.
- if (IsReassociatable || Flags.hasAllowReassociation()) {
- // we don't expect an accumulator for reassociatable vector reduction ops
+ if (IsAssociative || allowUnsafeFPMath(DAG.getMachineFunction())) {
+ // We don't expect an accumulator for reassociable vector reduction ops.
assert(!Accumulator && "unexpected accumulator");
return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
}
// Lower to sequential reduction.
for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
+ // Try to reduce the remaining sequence as much as possible using the
+ // current operator.
assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
if (!Accumulator) {
+ // Try to initialize the accumulator using the current operator.
if (I + DefaultGroupSize <= NumElts) {
Accumulator = DAG.getNode(
DefaultScalarOp, DL, EltTy,
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index c8b1238a24752..0ba69266fbd7a 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -316,17 +316,17 @@ define half @reduce_fmax_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: max.f16 %rs5, %rs3, %rs4;
-; CHECK-NEXT: max.f16 %rs6, %rs5, %rs1;
-; CHECK-NEXT: max.f16 %rs7, %rs6, %rs2;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
-; CHECK-NEXT: max.f16 %rs10, %rs7, %rs8;
-; CHECK-NEXT: max.f16 %rs11, %rs10, %rs9;
-; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
-; CHECK-NEXT: max.f16 %rs14, %rs11, %rs12;
-; CHECK-NEXT: max.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.f16 %rs15, %rs14, %rs7;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmax(<8 x half> %in)
@@ -392,13 +392,13 @@ define float @reduce_fmax_float(<8 x float> %in) {
; CHECK-SM80-NEXT: // %bb.0:
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
-; CHECK-SM80-NEXT: max.f32 %r9, %r1, %r2;
-; CHECK-SM80-NEXT: max.f32 %r10, %r9, %r3;
-; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r4;
-; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r5;
-; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r6;
-; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r7;
-; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-SM80-NEXT: ret;
;
@@ -409,9 +409,9 @@ define float @reduce_fmax_float(<8 x float> %in) {
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16];
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0];
-; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3;
-; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r4, %r5;
-; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: max.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r7;
; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r8;
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-SM100-NEXT: ret;
@@ -500,17 +500,17 @@ define half @reduce_fmin_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: min.f16 %rs5, %rs3, %rs4;
-; CHECK-NEXT: min.f16 %rs6, %rs5, %rs1;
-; CHECK-NEXT: min.f16 %rs7, %rs6, %rs2;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
-; CHECK-NEXT: min.f16 %rs10, %rs7, %rs8;
-; CHECK-NEXT: min.f16 %rs11, %rs10, %rs9;
-; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
-; CHECK-NEXT: min.f16 %rs14, %rs11, %rs12;
-; CHECK-NEXT: min.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.f16 %rs15, %rs14, %rs7;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmin(<8 x half> %in)
@@ -576,13 +576,13 @@ define float @reduce_fmin_float(<8 x float> %in) {
; CHECK-SM80-NEXT: // %bb.0:
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
-; CHECK-SM80-NEXT: min.f32 %r9, %r1, %r2;
-; CHECK-SM80-NEXT: min.f32 %r10, %r9, %r3;
-; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r4;
-; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r5;
-; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r6;
-; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r7;
-; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-SM80-NEXT: ret;
;
@@ -593,9 +593,9 @@ define float @reduce_fmin_float(<8 x float> %in) {
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16];
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0];
-; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3;
-; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r4, %r5;
-; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: min.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r7;
; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r8;
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-SM100-NEXT: ret;
@@ -684,17 +684,17 @@ define half @reduce_fmaximum_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: max.NaN.f16 %rs5, %rs3, %rs4;
-; CHECK-NEXT: max.NaN.f16 %rs6, %rs5, %rs1;
-; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs2;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
-; CHECK-NEXT: max.NaN.f16 %rs10, %rs7, %rs8;
-; CHECK-NEXT: max.NaN.f16 %rs11, %rs10, %rs9;
-; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
-; CHECK-NEXT: max.NaN.f16 %rs14, %rs11, %rs12;
-; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: max.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: max.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: max.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: max.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: max.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: max.NaN.f16 %rs15, %rs14, %rs7;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fmaximum(<8 x half> %in)
@@ -760,13 +760,13 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
; CHECK-SM80-NEXT: // %bb.0:
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
-; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r1, %r2;
-; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r9, %r3;
-; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r4;
-; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r11, %r5;
-; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r12, %r6;
-; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r7;
-; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-SM80-NEXT: ret;
;
@@ -777,9 +777,9 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16];
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0];
-; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r1, %r2, %r3;
-; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r9, %r4, %r5;
-; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r7;
; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r8;
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-SM100-NEXT: ret;
@@ -868,17 +868,17 @@ define half @reduce_fminimum_half(<8 x half> %in) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NEXT: min.NaN.f16 %rs5, %rs3, %rs4;
-; CHECK-NEXT: min.NaN.f16 %rs6, %rs5, %rs1;
-; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs2;
-; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r3;
-; CHECK-NEXT: min.NaN.f16 %rs10, %rs7, %rs8;
-; CHECK-NEXT: min.NaN.f16 %rs11, %rs10, %rs9;
-; CHECK-NEXT: mov.b32 {%rs12, %rs13}, %r4;
-; CHECK-NEXT: min.NaN.f16 %rs14, %rs11, %rs12;
-; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs13;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r3;
+; CHECK-NEXT: min.NaN.f16 %rs6, %rs4, %rs5;
+; CHECK-NEXT: min.NaN.f16 %rs7, %rs6, %rs3;
+; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r2;
+; CHECK-NEXT: min.NaN.f16 %rs10, %rs8, %rs9;
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r1;
+; CHECK-NEXT: min.NaN.f16 %rs13, %rs11, %rs12;
+; CHECK-NEXT: min.NaN.f16 %rs14, %rs13, %rs10;
+; CHECK-NEXT: min.NaN.f16 %rs15, %rs14, %rs7;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs15;
; CHECK-NEXT: ret;
%res = call half @llvm.vector.reduce.fminimum(<8 x half> %in)
@@ -944,13 +944,13 @@ define float @reduce_fminimum_float(<8 x float> %in) {
; CHECK-SM80-NEXT: // %bb.0:
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
-; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r1, %r2;
-; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r9, %r3;
-; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r4;
-; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r11, %r5;
-; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r12, %r6;
-; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r7;
-; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r8;
+; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8;
+; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6;
+; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9;
+; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4;
+; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2;
+; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12;
+; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11;
; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15;
; CHECK-SM80-NEXT: ret;
;
@@ -961,9 +961,9 @@ define float @reduce_fminimum_float(<8 x float> %in) {
; CHECK-SM100-NEXT: // %bb.0:
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16];
; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0];
-; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r1, %r2, %r3;
-; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r9, %r4, %r5;
-; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r6, %r7;
+; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r4, %r5, %r6;
+; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r1, %r2, %r3;
+; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r7;
; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r8;
; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-SM100-NEXT: ret;
More information about the llvm-commits
mailing list