[llvm] [LLVM][NVPTX] Add support for tcgen05.ld.red Instruction (PR #175919)
Pradeep Kumar via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 14 17:30:20 PST 2026
https://github.com/schwarzschild-radius updated https://github.com/llvm/llvm-project/pull/175919
>From 313bf84427300079eab966639de52d245c934f6c Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Tue, 13 Jan 2026 18:02:12 +0000
Subject: [PATCH] [LLVM][NVPTX] Add support for tcgen05.ld.red Instruction
This commit adds support for the tcgen05.ld.red instruction, with tests under tcgen05-ld-red.ll
---
llvm/docs/NVPTXUsage.rst | 50 ++-
llvm/include/llvm/IR/IntrinsicsNVVM.td | 44 ++-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 264 ++++++++++++-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 102 +++++
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 5 +
llvm/test/CodeGen/NVPTX/tcgen05-ld-red.ll | 389 ++++++++++++++++++++
6 files changed, 823 insertions(+), 31 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-ld-red.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 21cbde1b4a706..8d0952b557dae 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -2806,30 +2806,42 @@ Syntax:
declare <n x i32> @llvm.nvvm.tcgen05.ld.<shape>.<num>(ptr addrspace(6) %tmem_addr, i1 %pack)
+ declare { <n x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.<num>.i32(ptr addrspace(6) %tmem_addr, i32 %redOp)
+
+ declare { <n x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.<num>.f32(ptr addrspace(6) %tmem_addr, i32 %redOp, i1 %abs, i1 %nan)
+
declare <n x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, i64 %offset, i1 %pack)
+ declare { <n x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.<num>.i32(ptr addrspace(6) %tmem_addr, i64 %offset, i32 %redOp)
+
+ declare { <n x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.<num>.f32(ptr addrspace(6) %tmem_addr, i64 %offset, i32 %redOp, i1 %abs, i1 %nan)
+
Overview:
"""""""""
-This group of intrinsics asynchronously load data from the Tensor Memory at the location specified
-by the 32-bit address operand `tmem_addr` into the destination registers, collectively across all threads
-of the warps.
+This group of intrinsics asynchronously loads data from the Tensor Memory at the
+location specified by the 32-bit address operand `tmem_addr` into the destination
+registers, collectively across all threads of the warps.
-All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
-of the collective load operation. Otherwise, the behavior is undefined.
+All the threads in the warp must specify the same value of `tmem_addr`, which must
+be the base address of the collective load operation. Otherwise, the behavior is
+undefined.
-The `shape` qualifier and the `num` qualifier together determines the total dimension of the data ('n') which
-is loaded from the Tensor Memory. The `shape` qualifier indicates the base dimension of data. The `num` qualifier
-indicates the repeat factor on the base dimension resulting in the total dimension of the data that is accessed.
+The `shape` qualifier and the `num` qualifier together determine the total
+dimension of the data ('n') which is loaded from the Tensor Memory. The `shape`
+qualifier indicates the base dimension of the data. The `num` qualifier indicates
+the repeat factor on the base dimension, resulting in the total dimension of the
+data that is accessed.
-Allowed values for the 'num' are `x1, x2, x4, x8, x16, x32, x64, x128`.
+Allowed values for the `num` are `x1, x2, x4, x8, x16, x32, x64, x128`, except for
+`tcgen05.ld.red`, which does not support `x1`.
Allowed values for the 'shape' in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.
Allowed value for the 'shape' in the second intrinsic is `16x32bx2`.
-The result of the intrinsic is a vector consisting of one or more 32-bit registers derived from `shape` and
-`num` as shown below.
+The result of the intrinsic is a vector consisting of one or more 32-bit registers
+derived from `shape` and `num` as shown below.
=========== ========================= ========== ==========
num/shape 16x32bx2/16x64b/32x32b 16x128b 16x256b
@@ -2844,7 +2856,21 @@ The result of the intrinsic is a vector consisting of one or more 32-bit registe
x128 128 NA NA
=========== ========================= ========== ==========
-The last argument `i1 %pack` is a compile-time constant which when set, indicates that the adjacent columns are packed into a single 32-bit element during the load
+The last argument `i1 %pack` is a compile-time constant which, when set,
+indicates that the adjacent columns are packed into a single 32-bit element
+during the load.
+
+The `tcgen05.ld.red` intrinsics take a `%redOp` flag that selects the load
+reduction operation and return the reduction value as the second member of the
+result struct, alongside the loaded vector. The f32 variants additionally
+support the `%abs` and `%nan` bit flags for abs and NaN modes respectively.
+
+`%redOp` flag:
+
+=========== =============
+ value operation
+=========== =============
+ 0 min
+ 1 max
+=========== =============
For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
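
As a concrete illustration (a minimal sketch mirroring the calls exercised in
tcgen05-ld-red.ll below; the reduction result is the second member of the
returned struct), the i32 variants take only the `%redOp` selector, while the
f32 variants also take the `%abs` and `%nan` flags:

  ; min-reduction over a 2-element 32x32b load
  %v.i32 = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %taddr, i32 0)

  ; max-reduction with abs and NaN modes enabled over a 4-element 32x32b load
  %v.f32 = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)

  ; the 16x32bx2 shape takes an additional 64-bit offset operand
  %v.off = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
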
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 76677d5741eab..8bbf803dcd427 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1143,7 +1143,7 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
[OpType, llvm_i32_ty, llvm_i32_ty]);
}
-class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
+class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num, string ElemType = "i32"> {
int shift = !cond(!eq(Shape, "16x128b"): 1,
!eq(Shape, "16x256b"): 2,
true : 0);
@@ -1151,14 +1151,14 @@ class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
int veclen = !shl(1, !add(Num, shift));
int valid = !le(veclen, 128);
- LLVMType type = !cond(!eq(veclen, 1): llvm_i32_ty,
- !eq(veclen, 2): llvm_v2i32_ty,
- !eq(veclen, 4): llvm_v4i32_ty,
- !eq(veclen, 8): llvm_v8i32_ty,
- !eq(veclen, 16): llvm_v16i32_ty,
- !eq(veclen, 32): llvm_v32i32_ty,
- !eq(veclen, 64): llvm_v64i32_ty,
- !eq(veclen, 128): llvm_v128i32_ty,
+ LLVMType type = !cond(!eq(veclen, 1): LLVMType<!cast<ValueType>(ElemType)>,
+ !eq(veclen, 2): LLVMType<!cast<ValueType>("v"#2#ElemType)>,
+ !eq(veclen, 4): LLVMType<!cast<ValueType>("v"#4#ElemType)>,
+ !eq(veclen, 8): LLVMType<!cast<ValueType>("v"#8#ElemType)>,
+ !eq(veclen, 16): LLVMType<!cast<ValueType>("v"#16#ElemType)>,
+ !eq(veclen, 32): LLVMType<!cast<ValueType>("v"#32#ElemType)>,
+ !eq(veclen, 64): LLVMType<!cast<ValueType>("v"#64#ElemType)>,
+ !eq(veclen, 128): LLVMType<!cast<ValueType>("v"#128#ElemType)>,
true : llvm_void_ty);
}
@@ -3126,6 +3126,16 @@ class NVVM_TCGEN05_LD<string Shape, int Num> :
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
[ImmArg<ArgIndex<1>>]))>;
+// Tcgen05 ld.red intrinsics
+class NVVM_TCGEN05_LD_RED<string Shape, int Num, string RedValTy> :
+ DefaultAttrsIntrinsicFlags<
+ [NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num, RedValTy>.type, LLVMType<!cast<ValueType>(RedValTy)>],
+ [llvm_tmem_ptr_ty],
+ !listconcat(!if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
+ [llvm_i32_ty], // RedOp
+ !if(!eq(RedValTy, "f32"), [llvm_i1_ty, llvm_i1_ty], [])), // abs, nan
+ [IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+
// Tcgen05 st intrinsics
class NVVM_TCGEN05_ST<string Shape, int Num> :
Intrinsic<[],
@@ -3138,6 +3148,7 @@ class NVVM_TCGEN05_ST<string Shape, int Num> :
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>],
[ImmArg<ArgIndex<2>>]))>;
+// tcgen05 ld/st intrinsics
foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
foreach num = 0...8 in {
if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
@@ -3149,6 +3160,21 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
}
}
+// tcgen05.ld.red intrinsics
+foreach shape = ["32x32b", "16x32bx2"] in {
+ // num starts with x2 (1 << 1) as tcgen05.ld.red does not support x1
+ foreach num = 1...8 in {
+ foreach ty = ["f32", "i32"] in {
+ if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
+
+ def IntrinsicName<"llvm.nvvm.tcgen05.ld.red." # shape
+ # ".x" # !shl(1, num) # "." # ty>.record_name:
+ NVVM_TCGEN05_LD_RED<shape, num, ty>;
+ } // valid
+ } // ty
+ } // num
+} // shape
+
//
// Bulk store intrinsics
//
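
For reference, the new foreach above is expected to instantiate intrinsic
records whose IR-level signatures look like the following (an illustrative
sketch based on the test file below; the full set covers x2 through x128 for
both shapes and both element types):

  declare { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6), i32)
  declare { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6), i32, i1, i1)
  declare { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6), i64, i32)
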
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index dfd9486b971be..23920af9f00c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1092,7 +1092,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom lowering for tcgen05.ld vector operands
setOperationAction(ISD::INTRINSIC_W_CHAIN,
{MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
- MVT::v32i32, MVT::v64i32, MVT::v128i32},
+ MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,
+ MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,
+ MVT::v64f32, MVT::v128f32},
Custom);
// Custom lowering for tcgen05.st vector operands
@@ -3005,6 +3007,121 @@ static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
return getPRMT(A, B, Selector, DL, DAG, Mode);
}
+#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE) \
+ Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE
+
+#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE) \
+ NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE
+
+static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
+ switch (IID) {
+ case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 2, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 4, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 8, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 16, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 32, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 64, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
+ return TCGEN05_LD_RED_INST(32x32b, 128, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
+ case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 2, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 4, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 8, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 16, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 32, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 64, I32);
+ case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
+ return TCGEN05_LD_RED_INST(32x32b, 128, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
+ case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
+ return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
+ default:
+ llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
+ }
+}
+
+// Lower vector return type of tcgen05.ld.red intrinsics
+static std::optional<std::tuple<SDValue, SDValue, SDValue>>
+lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ if (!ResVT.isVector())
+ return {}; // already legalized.
+
+ const unsigned NumElts = ResVT.getVectorNumElements();
+
+ // Create the return type of the instructions
+ // +1 represents the reduction value
+ SmallVector<EVT, 132> ListVTs{
+ NumElts + 1,
+ ResVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32};
+
+ ListVTs.push_back(MVT::Other); // Chain
+
+ SDVTList ResVTs = DAG.getVTList(ListVTs);
+
+ // Prepare the Operands
+ SmallVector<SDValue, 8> Ops{N->getOperand(0)}; // Chain
+
+ // skip IID at index 1
+ for (unsigned i = 2; i < N->getNumOperands(); i++)
+ Ops.push_back(N->getOperand(i));
+
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+ SDValue NewNode =
+ DAG.getMemIntrinsicNode(getTcgen05LdRedID(IID), DL, ResVTs, Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+ // Split vector result
+ SmallVector<SDValue, 132> ScalarRes;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewNode.getValue(i);
+ ScalarRes.push_back(Res);
+ }
+
+ SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+ SDValue RedResult = NewNode.getValue(NumElts);
+ SDValue Chain = NewNode.getValue(NumElts + 1);
+ return {{BuildVector, RedResult, Chain}};
+}
+
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
switch (Op->getConstantOperandVal(1)) {
default:
@@ -3019,9 +3136,14 @@ static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
return SDValue();
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32:
case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
- if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
- return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
+ if (auto Res = lowerTcgen05LdRed(Op.getNode(), DAG))
+ return DAG.getMergeValues(
+ {std::get<0>(*Res), std::get<1>(*Res), std::get<2>(*Res)}, SDLoc(Op));
return SDValue();
}
}
@@ -5024,7 +5146,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v2i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5034,11 +5158,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v2f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5048,11 +5185,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v4f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5062,11 +5212,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v8f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v16i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5076,11 +5239,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v16f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v32i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5090,11 +5266,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v32f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v64i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5104,11 +5293,24 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v64f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
- case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
+ case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v128i32;
Info.ptrVal = I.getArgOperand(0);
@@ -5118,6 +5320,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::v128f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOLoad;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
@@ -6862,6 +7075,37 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
Results.push_back(Res->second);
}
return;
+
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
+ case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32:
+ if (auto Res = lowerTcgen05LdRed(N, DAG)) {
+ Results.push_back(std::get<0>(*Res));
+ Results.push_back(std::get<1>(*Res));
+ Results.push_back(std::get<2>(*Res));
+ }
+ return;
}
}
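
At the IR level (a minimal sketch of how a consumer might unpack the result;
the lowering above splits the struct into the loaded vector, the reduction
value, and the chain), both results are retrieved with extractvalue:

  %res = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %taddr, i32 0)
  %vec = extractvalue { <4 x i32>, i32 } %res, 0   ; loaded <4 x i32> vector
  %red = extractvalue { <4 x i32>, i32 } %res, 1   ; min-reduction result
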
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f310d43f02d8e..ad7031d088c79 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -5763,6 +5763,108 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
} // isConvergent
+//
+// tcgen05.ld.red
+//
+
+class Tcgen05LdRedTypeProfile<int Num, bit hasOffset, ValueType RedValTy>:
+ SDTypeProfile<0, 0, []> {
+
+ list<ValueType> Results = !listsplat(RedValTy, !add(Num, 1)); // ret_type, red_val
+ list<ValueType> VTs = !listconcat(
+ Results,
+ [i32], // ptr
+ !if(!eq(hasOffset, 1), [i64], []), // offset
+ [i32], // redOp
+ !if(!eq(RedValTy, f32), [i1, i1], []) // abs, nan
+ );
+ let Constraints = !foreach(x, !range(!size(VTs)), SDTCisVT<x, VTs[x]>);
+ let NumResults = !size(Results);
+ let NumOperands = !sub(!size(VTs), NumResults);
+}
+
+class Tcgen05LdRedSDNode<string Shape, int Num, string Type>:
+ SDNode<"NVPTXISD::TCGEN05_LD_RED_" # Shape # "_X" # Num # "_" # !toupper(Type),
+ Tcgen05LdRedTypeProfile<Num, !eq(Shape, "16x32bx2"), !cast<ValueType>(Type)>,
+ [SDNPHasChain, SDNPSideEffect, SDNPMemOperand]>;
+
+class Tcgen05LdRedInst<string Shape, int Num, string RedOp, string Type,
+ bit Abs = 0, bit Nan = 0>:
+ NVPTXInst<(outs), (ins), "?", []>,
+ Requires<[callSubtarget<"hasTcgen05LdRedSupport">]> {
+
+ bit IsFloat = !eq(Type, "f32");
+ string TypeStr = !if(IsFloat, "f32", "u32");
+
+ SDNode Opcode = Tcgen05LdRedSDNode<Shape, Num, Type>;
+
+ TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<Num>;
+
+ let InOperandList = !con((ins B32:$addr),
+ !if(!eq(Shape, "16x32bx2"), (ins i64imm:$offset),
+ (ins)));
+
+ let OutOperandList = !con(Info.Outs, (outs B32:$redVal));
+
+ string AbsStr = !if(!and(!eq(IsFloat, 1), !eq(Abs, 1)), ".abs", "");
+ string NanStr = !if(!and(!eq(IsFloat, 1), !eq(Nan, 1)), ".NaN", "");
+
+ let AsmString = "tcgen05.ld.red.sync.aligned"
+ # "." # Shape
+ # ".x" # Num
+ # "." # RedOp
+ # AbsStr # NanStr
+ # "." # TypeStr # " "
+ # Info.regstring # ", $redVal, [$addr]"
+ # !if(!eq(Shape, "16x32bx2"), ", $offset", "")
+ # ";";
+
+ int RedOpVal = !cond(
+ !eq(RedOp, "min") : 0,
+ !eq(RedOp, "max") : 1,
+ );
+
+ int AbsVal = !if(!eq(Abs, 1), -1, 0);
+ int NanVal = !if(!eq(Nan, 1), -1, 0);
+
+ dag IntrinsicPattern = !con((Opcode i32:$addr),
+ !if(!eq(Shape, "16x32bx2"), (Opcode i64:$offset),
+ (Opcode)));
+
+ dag FlagOperands = !con((Opcode (i32 RedOpVal)),
+ !if(!eq(IsFloat, 1), (Opcode (i1 AbsVal), (i1 NanVal)),
+ (Opcode)));
+
+ dag Results = !con(!foreach(tmp, OutOperandList,
+ !subst(outs, set,
+ !subst(B32, !if(!eq(IsFloat, 1), f32, i32), tmp))),
+ // Unfortunately, there is no way to append to a dag, so create a dummy
+ // argument that can later be replaced with the intrinsic argument dag.
+ (set 0));
+
+ let Pattern = [!setdagarg(Results, !sub(!size(Results), 1),
+ !con(IntrinsicPattern, FlagOperands))];
+}
+
+let isConvergent = true in {
+ foreach shape = ["32x32b", "16x32bx2"] in {
+ foreach num = 1...8 in {
+ foreach redop = ["min", "max"] in {
+ defvar access_size = NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>;
+ if access_size.valid then {
+ foreach abs = [0, 1] in {
+ foreach nan = [0, 1] in {
+ def : Tcgen05LdRedInst<shape, access_size.veclen, redop, "f32", abs, nan>;
+ } // nan
+ } // abs
+ def : Tcgen05LdRedInst<shape, access_size.veclen, redop, "i32">;
+ } // valid
+ } // redop
+ } // num
+ } // shape
+} // isConvergent
+
// Bulk store instructions
def st_bulk_imm : TImmLeaf<i64, [{ return Imm == 0; }]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 5f426bf1a15f9..98926e048b921 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -162,6 +162,11 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
hasPTXWithAccelSMs(86, {100, 101, 103});
}
+ bool hasTcgen05LdRedSupport() const {
+ return hasPTXWithFamilySMs(90, {110, 103}) ||
+ hasPTXWithFamilySMs(88, {101, 103});
+ }
+
bool hasReduxSyncF32() const {
return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100});
}
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld-red.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld-red.ll
new file mode 100644
index 0000000000000..229d30d9c640c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld-red.ll
@@ -0,0 +1,389 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_103f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
+; RUN: %if ptxas-sm_101a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %}
+; RUN: %if ptxas-sm_103f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103f | %ptxas-verify -arch=sm_103f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %}
+
+define void @nvvm_tcgen05_ld_32x32b_min_i32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_32x32b_min_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<263>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_min_i32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.min.u32 {%r2, %r3}, %r4, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.min.u32 {%r5, %r6, %r7, %r8}, %r9, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.min.u32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.min.u32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.min.u32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.min.u32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.min.u32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1];
+; CHECK-NEXT: ret;
+ tail call {<2 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<4 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<8 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<16 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<32 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<64 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %taddr, i32 0)
+ tail call {<128 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %taddr, i32 0)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_32x32b_max_i32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_32x32b_max_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<263>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_max_i32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.max.u32 {%r2, %r3}, %r4, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.max.u32 {%r5, %r6, %r7, %r8}, %r9, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.max.u32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.max.u32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.max.u32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.max.u32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.max.u32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1];
+; CHECK-NEXT: ret;
+ tail call {<2 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<4 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<8 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<16 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<32 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<64 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %taddr, i32 1)
+ tail call {<128 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %taddr, i32 1)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_16x32bx2_min_i32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_16x32bx2_min_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<263>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_min_i32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.min.u32 {%r2, %r3}, %r4, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.min.u32 {%r5, %r6, %r7, %r8}, %r9, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.min.u32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.min.u32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.min.u32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.min.u32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.min.u32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1], 0;
+; CHECK-NEXT: ret;
+ tail call {<2 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<4 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<8 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<16 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<32 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<64 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ tail call {<128 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.i32(ptr addrspace(6) %taddr, i64 0, i32 0)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_16x32bx2_max_i32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_16x32bx2_max_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<263>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_max_i32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.max.u32 {%r2, %r3}, %r4, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.max.u32 {%r5, %r6, %r7, %r8}, %r9, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.max.u32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.max.u32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.max.u32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.max.u32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.max.u32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1], 0;
+; CHECK-NEXT: ret;
+ tail call {<2 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<4 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<8 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<16 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<32 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<64 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ tail call {<128 x i32>, i32} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.i32(ptr addrspace(6) %taddr, i64 0, i32 1)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_32x32b_min_f32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_32x32b_min_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<1046>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_min_f32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.min.f32 {%r2, %r3}, %r4, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.min.f32 {%r5, %r6, %r7, %r8}, %r9, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.min.f32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.min.f32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.min.f32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.min.f32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.min.f32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.min.abs.f32 {%r263, %r264}, %r265, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.min.abs.f32 {%r266, %r267, %r268, %r269}, %r270, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.min.abs.f32 {%r271, %r272, %r273, %r274, %r275, %r276, %r277, %r278}, %r279, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.min.abs.f32 {%r280, %r281, %r282, %r283, %r284, %r285, %r286, %r287, %r288, %r289, %r290, %r291, %r292, %r293, %r294, %r295}, %r296, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.min.abs.f32 {%r297, %r298, %r299, %r300, %r301, %r302, %r303, %r304, %r305, %r306, %r307, %r308, %r309, %r310, %r311, %r312, %r313, %r314, %r315, %r316, %r317, %r318, %r319, %r320, %r321, %r322, %r323, %r324, %r325, %r326, %r327, %r328}, %r329, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.min.abs.f32 {%r330, %r331, %r332, %r333, %r334, %r335, %r336, %r337, %r338, %r339, %r340, %r341, %r342, %r343, %r344, %r345, %r346, %r347, %r348, %r349, %r350, %r351, %r352, %r353, %r354, %r355, %r356, %r357, %r358, %r359, %r360, %r361, %r362, %r363, %r364, %r365, %r366, %r367, %r368, %r369, %r370, %r371, %r372, %r373, %r374, %r375, %r376, %r377, %r378, %r379, %r380, %r381, %r382, %r383, %r384, %r385, %r386, %r387, %r388, %r389, %r390, %r391, %r392, %r393}, %r394, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.min.abs.f32 {%r395, %r396, %r397, %r398, %r399, %r400, %r401, %r402, %r403, %r404, %r405, %r406, %r407, %r408, %r409, %r410, %r411, %r412, %r413, %r414, %r415, %r416, %r417, %r418, %r419, %r420, %r421, %r422, %r423, %r424, %r425, %r426, %r427, %r428, %r429, %r430, %r431, %r432, %r433, %r434, %r435, %r436, %r437, %r438, %r439, %r440, %r441, %r442, %r443, %r444, %r445, %r446, %r447, %r448, %r449, %r450, %r451, %r452, %r453, %r454, %r455, %r456, %r457, %r458, %r459, %r460, %r461, %r462, %r463, %r464, %r465, %r466, %r467, %r468, %r469, %r470, %r471, %r472, %r473, %r474, %r475, %r476, %r477, %r478, %r479, %r480, %r481, %r482, %r483, %r484, %r485, %r486, %r487, %r488, %r489, %r490, %r491, %r492, %r493, %r494, %r495, %r496, %r497, %r498, %r499, %r500, %r501, %r502, %r503, %r504, %r505, %r506, %r507, %r508, %r509, %r510, %r511, %r512, %r513, %r514, %r515, %r516, %r517, %r518, %r519, %r520, %r521, %r522}, %r523, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.min.NaN.f32 {%r524, %r525}, %r526, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.min.NaN.f32 {%r527, %r528, %r529, %r530}, %r531, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.min.NaN.f32 {%r532, %r533, %r534, %r535, %r536, %r537, %r538, %r539}, %r540, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.min.NaN.f32 {%r541, %r542, %r543, %r544, %r545, %r546, %r547, %r548, %r549, %r550, %r551, %r552, %r553, %r554, %r555, %r556}, %r557, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.min.NaN.f32 {%r558, %r559, %r560, %r561, %r562, %r563, %r564, %r565, %r566, %r567, %r568, %r569, %r570, %r571, %r572, %r573, %r574, %r575, %r576, %r577, %r578, %r579, %r580, %r581, %r582, %r583, %r584, %r585, %r586, %r587, %r588, %r589}, %r590, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.min.NaN.f32 {%r591, %r592, %r593, %r594, %r595, %r596, %r597, %r598, %r599, %r600, %r601, %r602, %r603, %r604, %r605, %r606, %r607, %r608, %r609, %r610, %r611, %r612, %r613, %r614, %r615, %r616, %r617, %r618, %r619, %r620, %r621, %r622, %r623, %r624, %r625, %r626, %r627, %r628, %r629, %r630, %r631, %r632, %r633, %r634, %r635, %r636, %r637, %r638, %r639, %r640, %r641, %r642, %r643, %r644, %r645, %r646, %r647, %r648, %r649, %r650, %r651, %r652, %r653, %r654}, %r655, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.min.NaN.f32 {%r656, %r657, %r658, %r659, %r660, %r661, %r662, %r663, %r664, %r665, %r666, %r667, %r668, %r669, %r670, %r671, %r672, %r673, %r674, %r675, %r676, %r677, %r678, %r679, %r680, %r681, %r682, %r683, %r684, %r685, %r686, %r687, %r688, %r689, %r690, %r691, %r692, %r693, %r694, %r695, %r696, %r697, %r698, %r699, %r700, %r701, %r702, %r703, %r704, %r705, %r706, %r707, %r708, %r709, %r710, %r711, %r712, %r713, %r714, %r715, %r716, %r717, %r718, %r719, %r720, %r721, %r722, %r723, %r724, %r725, %r726, %r727, %r728, %r729, %r730, %r731, %r732, %r733, %r734, %r735, %r736, %r737, %r738, %r739, %r740, %r741, %r742, %r743, %r744, %r745, %r746, %r747, %r748, %r749, %r750, %r751, %r752, %r753, %r754, %r755, %r756, %r757, %r758, %r759, %r760, %r761, %r762, %r763, %r764, %r765, %r766, %r767, %r768, %r769, %r770, %r771, %r772, %r773, %r774, %r775, %r776, %r777, %r778, %r779, %r780, %r781, %r782, %r783}, %r784, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.min.abs.NaN.f32 {%r785, %r786}, %r787, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.min.abs.NaN.f32 {%r788, %r789, %r790, %r791}, %r792, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.min.abs.NaN.f32 {%r793, %r794, %r795, %r796, %r797, %r798, %r799, %r800}, %r801, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.min.abs.NaN.f32 {%r802, %r803, %r804, %r805, %r806, %r807, %r808, %r809, %r810, %r811, %r812, %r813, %r814, %r815, %r816, %r817}, %r818, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.min.abs.NaN.f32 {%r819, %r820, %r821, %r822, %r823, %r824, %r825, %r826, %r827, %r828, %r829, %r830, %r831, %r832, %r833, %r834, %r835, %r836, %r837, %r838, %r839, %r840, %r841, %r842, %r843, %r844, %r845, %r846, %r847, %r848, %r849, %r850}, %r851, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.min.abs.NaN.f32 {%r852, %r853, %r854, %r855, %r856, %r857, %r858, %r859, %r860, %r861, %r862, %r863, %r864, %r865, %r866, %r867, %r868, %r869, %r870, %r871, %r872, %r873, %r874, %r875, %r876, %r877, %r878, %r879, %r880, %r881, %r882, %r883, %r884, %r885, %r886, %r887, %r888, %r889, %r890, %r891, %r892, %r893, %r894, %r895, %r896, %r897, %r898, %r899, %r900, %r901, %r902, %r903, %r904, %r905, %r906, %r907, %r908, %r909, %r910, %r911, %r912, %r913, %r914, %r915}, %r916, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.min.abs.NaN.f32 {%r917, %r918, %r919, %r920, %r921, %r922, %r923, %r924, %r925, %r926, %r927, %r928, %r929, %r930, %r931, %r932, %r933, %r934, %r935, %r936, %r937, %r938, %r939, %r940, %r941, %r942, %r943, %r944, %r945, %r946, %r947, %r948, %r949, %r950, %r951, %r952, %r953, %r954, %r955, %r956, %r957, %r958, %r959, %r960, %r961, %r962, %r963, %r964, %r965, %r966, %r967, %r968, %r969, %r970, %r971, %r972, %r973, %r974, %r975, %r976, %r977, %r978, %r979, %r980, %r981, %r982, %r983, %r984, %r985, %r986, %r987, %r988, %r989, %r990, %r991, %r992, %r993, %r994, %r995, %r996, %r997, %r998, %r999, %r1000, %r1001, %r1002, %r1003, %r1004, %r1005, %r1006, %r1007, %r1008, %r1009, %r1010, %r1011, %r1012, %r1013, %r1014, %r1015, %r1016, %r1017, %r1018, %r1019, %r1020, %r1021, %r1022, %r1023, %r1024, %r1025, %r1026, %r1027, %r1028, %r1029, %r1030, %r1031, %r1032, %r1033, %r1034, %r1035, %r1036, %r1037, %r1038, %r1039, %r1040, %r1041, %r1042, %r1043, %r1044}, %r1045, [%r1];
+; CHECK-NEXT: ret;
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 0, i1 0, i1 1)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 0, i1 1, i1 1)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_32x32b_max_f32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_32x32b_max_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<1046>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_max_f32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.max.f32 {%r2, %r3}, %r4, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.max.f32 {%r5, %r6, %r7, %r8}, %r9, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.max.f32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.max.f32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.max.f32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.max.f32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.max.f32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.max.abs.f32 {%r263, %r264}, %r265, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.max.abs.f32 {%r266, %r267, %r268, %r269}, %r270, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.max.abs.f32 {%r271, %r272, %r273, %r274, %r275, %r276, %r277, %r278}, %r279, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.max.abs.f32 {%r280, %r281, %r282, %r283, %r284, %r285, %r286, %r287, %r288, %r289, %r290, %r291, %r292, %r293, %r294, %r295}, %r296, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.max.abs.f32 {%r297, %r298, %r299, %r300, %r301, %r302, %r303, %r304, %r305, %r306, %r307, %r308, %r309, %r310, %r311, %r312, %r313, %r314, %r315, %r316, %r317, %r318, %r319, %r320, %r321, %r322, %r323, %r324, %r325, %r326, %r327, %r328}, %r329, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.max.abs.f32 {%r330, %r331, %r332, %r333, %r334, %r335, %r336, %r337, %r338, %r339, %r340, %r341, %r342, %r343, %r344, %r345, %r346, %r347, %r348, %r349, %r350, %r351, %r352, %r353, %r354, %r355, %r356, %r357, %r358, %r359, %r360, %r361, %r362, %r363, %r364, %r365, %r366, %r367, %r368, %r369, %r370, %r371, %r372, %r373, %r374, %r375, %r376, %r377, %r378, %r379, %r380, %r381, %r382, %r383, %r384, %r385, %r386, %r387, %r388, %r389, %r390, %r391, %r392, %r393}, %r394, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.max.abs.f32 {%r395, %r396, %r397, %r398, %r399, %r400, %r401, %r402, %r403, %r404, %r405, %r406, %r407, %r408, %r409, %r410, %r411, %r412, %r413, %r414, %r415, %r416, %r417, %r418, %r419, %r420, %r421, %r422, %r423, %r424, %r425, %r426, %r427, %r428, %r429, %r430, %r431, %r432, %r433, %r434, %r435, %r436, %r437, %r438, %r439, %r440, %r441, %r442, %r443, %r444, %r445, %r446, %r447, %r448, %r449, %r450, %r451, %r452, %r453, %r454, %r455, %r456, %r457, %r458, %r459, %r460, %r461, %r462, %r463, %r464, %r465, %r466, %r467, %r468, %r469, %r470, %r471, %r472, %r473, %r474, %r475, %r476, %r477, %r478, %r479, %r480, %r481, %r482, %r483, %r484, %r485, %r486, %r487, %r488, %r489, %r490, %r491, %r492, %r493, %r494, %r495, %r496, %r497, %r498, %r499, %r500, %r501, %r502, %r503, %r504, %r505, %r506, %r507, %r508, %r509, %r510, %r511, %r512, %r513, %r514, %r515, %r516, %r517, %r518, %r519, %r520, %r521, %r522}, %r523, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.max.NaN.f32 {%r524, %r525}, %r526, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.max.NaN.f32 {%r527, %r528, %r529, %r530}, %r531, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.max.NaN.f32 {%r532, %r533, %r534, %r535, %r536, %r537, %r538, %r539}, %r540, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.max.NaN.f32 {%r541, %r542, %r543, %r544, %r545, %r546, %r547, %r548, %r549, %r550, %r551, %r552, %r553, %r554, %r555, %r556}, %r557, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.max.NaN.f32 {%r558, %r559, %r560, %r561, %r562, %r563, %r564, %r565, %r566, %r567, %r568, %r569, %r570, %r571, %r572, %r573, %r574, %r575, %r576, %r577, %r578, %r579, %r580, %r581, %r582, %r583, %r584, %r585, %r586, %r587, %r588, %r589}, %r590, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.max.NaN.f32 {%r591, %r592, %r593, %r594, %r595, %r596, %r597, %r598, %r599, %r600, %r601, %r602, %r603, %r604, %r605, %r606, %r607, %r608, %r609, %r610, %r611, %r612, %r613, %r614, %r615, %r616, %r617, %r618, %r619, %r620, %r621, %r622, %r623, %r624, %r625, %r626, %r627, %r628, %r629, %r630, %r631, %r632, %r633, %r634, %r635, %r636, %r637, %r638, %r639, %r640, %r641, %r642, %r643, %r644, %r645, %r646, %r647, %r648, %r649, %r650, %r651, %r652, %r653, %r654}, %r655, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.max.NaN.f32 {%r656, %r657, %r658, %r659, %r660, %r661, %r662, %r663, %r664, %r665, %r666, %r667, %r668, %r669, %r670, %r671, %r672, %r673, %r674, %r675, %r676, %r677, %r678, %r679, %r680, %r681, %r682, %r683, %r684, %r685, %r686, %r687, %r688, %r689, %r690, %r691, %r692, %r693, %r694, %r695, %r696, %r697, %r698, %r699, %r700, %r701, %r702, %r703, %r704, %r705, %r706, %r707, %r708, %r709, %r710, %r711, %r712, %r713, %r714, %r715, %r716, %r717, %r718, %r719, %r720, %r721, %r722, %r723, %r724, %r725, %r726, %r727, %r728, %r729, %r730, %r731, %r732, %r733, %r734, %r735, %r736, %r737, %r738, %r739, %r740, %r741, %r742, %r743, %r744, %r745, %r746, %r747, %r748, %r749, %r750, %r751, %r752, %r753, %r754, %r755, %r756, %r757, %r758, %r759, %r760, %r761, %r762, %r763, %r764, %r765, %r766, %r767, %r768, %r769, %r770, %r771, %r772, %r773, %r774, %r775, %r776, %r777, %r778, %r779, %r780, %r781, %r782, %r783}, %r784, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x2.max.abs.NaN.f32 {%r785, %r786}, %r787, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x4.max.abs.NaN.f32 {%r788, %r789, %r790, %r791}, %r792, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x8.max.abs.NaN.f32 {%r793, %r794, %r795, %r796, %r797, %r798, %r799, %r800}, %r801, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x16.max.abs.NaN.f32 {%r802, %r803, %r804, %r805, %r806, %r807, %r808, %r809, %r810, %r811, %r812, %r813, %r814, %r815, %r816, %r817}, %r818, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x32.max.abs.NaN.f32 {%r819, %r820, %r821, %r822, %r823, %r824, %r825, %r826, %r827, %r828, %r829, %r830, %r831, %r832, %r833, %r834, %r835, %r836, %r837, %r838, %r839, %r840, %r841, %r842, %r843, %r844, %r845, %r846, %r847, %r848, %r849, %r850}, %r851, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x64.max.abs.NaN.f32 {%r852, %r853, %r854, %r855, %r856, %r857, %r858, %r859, %r860, %r861, %r862, %r863, %r864, %r865, %r866, %r867, %r868, %r869, %r870, %r871, %r872, %r873, %r874, %r875, %r876, %r877, %r878, %r879, %r880, %r881, %r882, %r883, %r884, %r885, %r886, %r887, %r888, %r889, %r890, %r891, %r892, %r893, %r894, %r895, %r896, %r897, %r898, %r899, %r900, %r901, %r902, %r903, %r904, %r905, %r906, %r907, %r908, %r909, %r910, %r911, %r912, %r913, %r914, %r915}, %r916, [%r1];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.32x32b.x128.max.abs.NaN.f32 {%r917, %r918, %r919, %r920, %r921, %r922, %r923, %r924, %r925, %r926, %r927, %r928, %r929, %r930, %r931, %r932, %r933, %r934, %r935, %r936, %r937, %r938, %r939, %r940, %r941, %r942, %r943, %r944, %r945, %r946, %r947, %r948, %r949, %r950, %r951, %r952, %r953, %r954, %r955, %r956, %r957, %r958, %r959, %r960, %r961, %r962, %r963, %r964, %r965, %r966, %r967, %r968, %r969, %r970, %r971, %r972, %r973, %r974, %r975, %r976, %r977, %r978, %r979, %r980, %r981, %r982, %r983, %r984, %r985, %r986, %r987, %r988, %r989, %r990, %r991, %r992, %r993, %r994, %r995, %r996, %r997, %r998, %r999, %r1000, %r1001, %r1002, %r1003, %r1004, %r1005, %r1006, %r1007, %r1008, %r1009, %r1010, %r1011, %r1012, %r1013, %r1014, %r1015, %r1016, %r1017, %r1018, %r1019, %r1020, %r1021, %r1022, %r1023, %r1024, %r1025, %r1026, %r1027, %r1028, %r1029, %r1030, %r1031, %r1032, %r1033, %r1034, %r1035, %r1036, %r1037, %r1038, %r1039, %r1040, %r1041, %r1042, %r1043, %r1044}, %r1045, [%r1];
+; CHECK-NEXT: ret;
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 1, i1 0, i1 1)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %taddr, i32 1, i1 1, i1 1)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_16x32bx2_min_f32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_16x32bx2_min_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<1046>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_min_f32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.min.f32 {%r2, %r3}, %r4, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.min.f32 {%r5, %r6, %r7, %r8}, %r9, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.min.f32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.min.f32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.min.f32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.min.f32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.min.f32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.min.abs.f32 {%r263, %r264}, %r265, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.min.abs.f32 {%r266, %r267, %r268, %r269}, %r270, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.min.abs.f32 {%r271, %r272, %r273, %r274, %r275, %r276, %r277, %r278}, %r279, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.min.abs.f32 {%r280, %r281, %r282, %r283, %r284, %r285, %r286, %r287, %r288, %r289, %r290, %r291, %r292, %r293, %r294, %r295}, %r296, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.min.abs.f32 {%r297, %r298, %r299, %r300, %r301, %r302, %r303, %r304, %r305, %r306, %r307, %r308, %r309, %r310, %r311, %r312, %r313, %r314, %r315, %r316, %r317, %r318, %r319, %r320, %r321, %r322, %r323, %r324, %r325, %r326, %r327, %r328}, %r329, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.min.abs.f32 {%r330, %r331, %r332, %r333, %r334, %r335, %r336, %r337, %r338, %r339, %r340, %r341, %r342, %r343, %r344, %r345, %r346, %r347, %r348, %r349, %r350, %r351, %r352, %r353, %r354, %r355, %r356, %r357, %r358, %r359, %r360, %r361, %r362, %r363, %r364, %r365, %r366, %r367, %r368, %r369, %r370, %r371, %r372, %r373, %r374, %r375, %r376, %r377, %r378, %r379, %r380, %r381, %r382, %r383, %r384, %r385, %r386, %r387, %r388, %r389, %r390, %r391, %r392, %r393}, %r394, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.min.abs.f32 {%r395, %r396, %r397, %r398, %r399, %r400, %r401, %r402, %r403, %r404, %r405, %r406, %r407, %r408, %r409, %r410, %r411, %r412, %r413, %r414, %r415, %r416, %r417, %r418, %r419, %r420, %r421, %r422, %r423, %r424, %r425, %r426, %r427, %r428, %r429, %r430, %r431, %r432, %r433, %r434, %r435, %r436, %r437, %r438, %r439, %r440, %r441, %r442, %r443, %r444, %r445, %r446, %r447, %r448, %r449, %r450, %r451, %r452, %r453, %r454, %r455, %r456, %r457, %r458, %r459, %r460, %r461, %r462, %r463, %r464, %r465, %r466, %r467, %r468, %r469, %r470, %r471, %r472, %r473, %r474, %r475, %r476, %r477, %r478, %r479, %r480, %r481, %r482, %r483, %r484, %r485, %r486, %r487, %r488, %r489, %r490, %r491, %r492, %r493, %r494, %r495, %r496, %r497, %r498, %r499, %r500, %r501, %r502, %r503, %r504, %r505, %r506, %r507, %r508, %r509, %r510, %r511, %r512, %r513, %r514, %r515, %r516, %r517, %r518, %r519, %r520, %r521, %r522}, %r523, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.min.NaN.f32 {%r524, %r525}, %r526, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.min.NaN.f32 {%r527, %r528, %r529, %r530}, %r531, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.min.NaN.f32 {%r532, %r533, %r534, %r535, %r536, %r537, %r538, %r539}, %r540, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.min.NaN.f32 {%r541, %r542, %r543, %r544, %r545, %r546, %r547, %r548, %r549, %r550, %r551, %r552, %r553, %r554, %r555, %r556}, %r557, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.min.NaN.f32 {%r558, %r559, %r560, %r561, %r562, %r563, %r564, %r565, %r566, %r567, %r568, %r569, %r570, %r571, %r572, %r573, %r574, %r575, %r576, %r577, %r578, %r579, %r580, %r581, %r582, %r583, %r584, %r585, %r586, %r587, %r588, %r589}, %r590, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.min.NaN.f32 {%r591, %r592, %r593, %r594, %r595, %r596, %r597, %r598, %r599, %r600, %r601, %r602, %r603, %r604, %r605, %r606, %r607, %r608, %r609, %r610, %r611, %r612, %r613, %r614, %r615, %r616, %r617, %r618, %r619, %r620, %r621, %r622, %r623, %r624, %r625, %r626, %r627, %r628, %r629, %r630, %r631, %r632, %r633, %r634, %r635, %r636, %r637, %r638, %r639, %r640, %r641, %r642, %r643, %r644, %r645, %r646, %r647, %r648, %r649, %r650, %r651, %r652, %r653, %r654}, %r655, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.min.NaN.f32 {%r656, %r657, %r658, %r659, %r660, %r661, %r662, %r663, %r664, %r665, %r666, %r667, %r668, %r669, %r670, %r671, %r672, %r673, %r674, %r675, %r676, %r677, %r678, %r679, %r680, %r681, %r682, %r683, %r684, %r685, %r686, %r687, %r688, %r689, %r690, %r691, %r692, %r693, %r694, %r695, %r696, %r697, %r698, %r699, %r700, %r701, %r702, %r703, %r704, %r705, %r706, %r707, %r708, %r709, %r710, %r711, %r712, %r713, %r714, %r715, %r716, %r717, %r718, %r719, %r720, %r721, %r722, %r723, %r724, %r725, %r726, %r727, %r728, %r729, %r730, %r731, %r732, %r733, %r734, %r735, %r736, %r737, %r738, %r739, %r740, %r741, %r742, %r743, %r744, %r745, %r746, %r747, %r748, %r749, %r750, %r751, %r752, %r753, %r754, %r755, %r756, %r757, %r758, %r759, %r760, %r761, %r762, %r763, %r764, %r765, %r766, %r767, %r768, %r769, %r770, %r771, %r772, %r773, %r774, %r775, %r776, %r777, %r778, %r779, %r780, %r781, %r782, %r783}, %r784, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.min.abs.NaN.f32 {%r785, %r786}, %r787, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.min.abs.NaN.f32 {%r788, %r789, %r790, %r791}, %r792, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.min.abs.NaN.f32 {%r793, %r794, %r795, %r796, %r797, %r798, %r799, %r800}, %r801, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.min.abs.NaN.f32 {%r802, %r803, %r804, %r805, %r806, %r807, %r808, %r809, %r810, %r811, %r812, %r813, %r814, %r815, %r816, %r817}, %r818, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.min.abs.NaN.f32 {%r819, %r820, %r821, %r822, %r823, %r824, %r825, %r826, %r827, %r828, %r829, %r830, %r831, %r832, %r833, %r834, %r835, %r836, %r837, %r838, %r839, %r840, %r841, %r842, %r843, %r844, %r845, %r846, %r847, %r848, %r849, %r850}, %r851, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.min.abs.NaN.f32 {%r852, %r853, %r854, %r855, %r856, %r857, %r858, %r859, %r860, %r861, %r862, %r863, %r864, %r865, %r866, %r867, %r868, %r869, %r870, %r871, %r872, %r873, %r874, %r875, %r876, %r877, %r878, %r879, %r880, %r881, %r882, %r883, %r884, %r885, %r886, %r887, %r888, %r889, %r890, %r891, %r892, %r893, %r894, %r895, %r896, %r897, %r898, %r899, %r900, %r901, %r902, %r903, %r904, %r905, %r906, %r907, %r908, %r909, %r910, %r911, %r912, %r913, %r914, %r915}, %r916, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.min.abs.NaN.f32 {%r917, %r918, %r919, %r920, %r921, %r922, %r923, %r924, %r925, %r926, %r927, %r928, %r929, %r930, %r931, %r932, %r933, %r934, %r935, %r936, %r937, %r938, %r939, %r940, %r941, %r942, %r943, %r944, %r945, %r946, %r947, %r948, %r949, %r950, %r951, %r952, %r953, %r954, %r955, %r956, %r957, %r958, %r959, %r960, %r961, %r962, %r963, %r964, %r965, %r966, %r967, %r968, %r969, %r970, %r971, %r972, %r973, %r974, %r975, %r976, %r977, %r978, %r979, %r980, %r981, %r982, %r983, %r984, %r985, %r986, %r987, %r988, %r989, %r990, %r991, %r992, %r993, %r994, %r995, %r996, %r997, %r998, %r999, %r1000, %r1001, %r1002, %r1003, %r1004, %r1005, %r1006, %r1007, %r1008, %r1009, %r1010, %r1011, %r1012, %r1013, %r1014, %r1015, %r1016, %r1017, %r1018, %r1019, %r1020, %r1021, %r1022, %r1023, %r1024, %r1025, %r1026, %r1027, %r1028, %r1029, %r1030, %r1031, %r1032, %r1033, %r1034, %r1035, %r1036, %r1037, %r1038, %r1039, %r1040, %r1041, %r1042, %r1043, %r1044}, %r1045, [%r1], 0;
+; CHECK-NEXT: ret;
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 0, i1 1)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 0, i1 1, i1 1)
+ ret void
+}
+
+define void @nvvm_tcgen05_ld_16x32bx2_max_f32(ptr addrspace(6) %taddr) {
+; CHECK-LABEL: nvvm_tcgen05_ld_16x32bx2_max_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<1046>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_max_f32_param_0];
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.max.f32 {%r2, %r3}, %r4, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.max.f32 {%r5, %r6, %r7, %r8}, %r9, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.max.f32 {%r10, %r11, %r12, %r13, %r14, %r15, %r16, %r17}, %r18, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.max.f32 {%r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29, %r30, %r31, %r32, %r33, %r34}, %r35, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.max.f32 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43, %r44, %r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52, %r53, %r54, %r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62, %r63, %r64, %r65, %r66, %r67}, %r68, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.max.f32 {%r69, %r70, %r71, %r72, %r73, %r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81, %r82, %r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90, %r91, %r92, %r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100, %r101, %r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109, %r110, %r111, %r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119, %r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127, %r128, %r129, %r130, %r131, %r132}, %r133, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.max.f32 {%r134, %r135, %r136, %r137, %r138, %r139, %r140, %r141, %r142, %r143, %r144, %r145, %r146, %r147, %r148, %r149, %r150, %r151, %r152, %r153, %r154, %r155, %r156, %r157, %r158, %r159, %r160, %r161, %r162, %r163, %r164, %r165, %r166, %r167, %r168, %r169, %r170, %r171, %r172, %r173, %r174, %r175, %r176, %r177, %r178, %r179, %r180, %r181, %r182, %r183, %r184, %r185, %r186, %r187, %r188, %r189, %r190, %r191, %r192, %r193, %r194, %r195, %r196, %r197, %r198, %r199, %r200, %r201, %r202, %r203, %r204, %r205, %r206, %r207, %r208, %r209, %r210, %r211, %r212, %r213, %r214, %r215, %r216, %r217, %r218, %r219, %r220, %r221, %r222, %r223, %r224, %r225, %r226, %r227, %r228, %r229, %r230, %r231, %r232, %r233, %r234, %r235, %r236, %r237, %r238, %r239, %r240, %r241, %r242, %r243, %r244, %r245, %r246, %r247, %r248, %r249, %r250, %r251, %r252, %r253, %r254, %r255, %r256, %r257, %r258, %r259, %r260, %r261}, %r262, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.max.abs.f32 {%r263, %r264}, %r265, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.max.abs.f32 {%r266, %r267, %r268, %r269}, %r270, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.max.abs.f32 {%r271, %r272, %r273, %r274, %r275, %r276, %r277, %r278}, %r279, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.max.abs.f32 {%r280, %r281, %r282, %r283, %r284, %r285, %r286, %r287, %r288, %r289, %r290, %r291, %r292, %r293, %r294, %r295}, %r296, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.max.abs.f32 {%r297, %r298, %r299, %r300, %r301, %r302, %r303, %r304, %r305, %r306, %r307, %r308, %r309, %r310, %r311, %r312, %r313, %r314, %r315, %r316, %r317, %r318, %r319, %r320, %r321, %r322, %r323, %r324, %r325, %r326, %r327, %r328}, %r329, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.max.abs.f32 {%r330, %r331, %r332, %r333, %r334, %r335, %r336, %r337, %r338, %r339, %r340, %r341, %r342, %r343, %r344, %r345, %r346, %r347, %r348, %r349, %r350, %r351, %r352, %r353, %r354, %r355, %r356, %r357, %r358, %r359, %r360, %r361, %r362, %r363, %r364, %r365, %r366, %r367, %r368, %r369, %r370, %r371, %r372, %r373, %r374, %r375, %r376, %r377, %r378, %r379, %r380, %r381, %r382, %r383, %r384, %r385, %r386, %r387, %r388, %r389, %r390, %r391, %r392, %r393}, %r394, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.max.abs.f32 {%r395, %r396, %r397, %r398, %r399, %r400, %r401, %r402, %r403, %r404, %r405, %r406, %r407, %r408, %r409, %r410, %r411, %r412, %r413, %r414, %r415, %r416, %r417, %r418, %r419, %r420, %r421, %r422, %r423, %r424, %r425, %r426, %r427, %r428, %r429, %r430, %r431, %r432, %r433, %r434, %r435, %r436, %r437, %r438, %r439, %r440, %r441, %r442, %r443, %r444, %r445, %r446, %r447, %r448, %r449, %r450, %r451, %r452, %r453, %r454, %r455, %r456, %r457, %r458, %r459, %r460, %r461, %r462, %r463, %r464, %r465, %r466, %r467, %r468, %r469, %r470, %r471, %r472, %r473, %r474, %r475, %r476, %r477, %r478, %r479, %r480, %r481, %r482, %r483, %r484, %r485, %r486, %r487, %r488, %r489, %r490, %r491, %r492, %r493, %r494, %r495, %r496, %r497, %r498, %r499, %r500, %r501, %r502, %r503, %r504, %r505, %r506, %r507, %r508, %r509, %r510, %r511, %r512, %r513, %r514, %r515, %r516, %r517, %r518, %r519, %r520, %r521, %r522}, %r523, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.max.NaN.f32 {%r524, %r525}, %r526, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.max.NaN.f32 {%r527, %r528, %r529, %r530}, %r531, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.max.NaN.f32 {%r532, %r533, %r534, %r535, %r536, %r537, %r538, %r539}, %r540, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.max.NaN.f32 {%r541, %r542, %r543, %r544, %r545, %r546, %r547, %r548, %r549, %r550, %r551, %r552, %r553, %r554, %r555, %r556}, %r557, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.max.NaN.f32 {%r558, %r559, %r560, %r561, %r562, %r563, %r564, %r565, %r566, %r567, %r568, %r569, %r570, %r571, %r572, %r573, %r574, %r575, %r576, %r577, %r578, %r579, %r580, %r581, %r582, %r583, %r584, %r585, %r586, %r587, %r588, %r589}, %r590, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.max.NaN.f32 {%r591, %r592, %r593, %r594, %r595, %r596, %r597, %r598, %r599, %r600, %r601, %r602, %r603, %r604, %r605, %r606, %r607, %r608, %r609, %r610, %r611, %r612, %r613, %r614, %r615, %r616, %r617, %r618, %r619, %r620, %r621, %r622, %r623, %r624, %r625, %r626, %r627, %r628, %r629, %r630, %r631, %r632, %r633, %r634, %r635, %r636, %r637, %r638, %r639, %r640, %r641, %r642, %r643, %r644, %r645, %r646, %r647, %r648, %r649, %r650, %r651, %r652, %r653, %r654}, %r655, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.max.NaN.f32 {%r656, %r657, %r658, %r659, %r660, %r661, %r662, %r663, %r664, %r665, %r666, %r667, %r668, %r669, %r670, %r671, %r672, %r673, %r674, %r675, %r676, %r677, %r678, %r679, %r680, %r681, %r682, %r683, %r684, %r685, %r686, %r687, %r688, %r689, %r690, %r691, %r692, %r693, %r694, %r695, %r696, %r697, %r698, %r699, %r700, %r701, %r702, %r703, %r704, %r705, %r706, %r707, %r708, %r709, %r710, %r711, %r712, %r713, %r714, %r715, %r716, %r717, %r718, %r719, %r720, %r721, %r722, %r723, %r724, %r725, %r726, %r727, %r728, %r729, %r730, %r731, %r732, %r733, %r734, %r735, %r736, %r737, %r738, %r739, %r740, %r741, %r742, %r743, %r744, %r745, %r746, %r747, %r748, %r749, %r750, %r751, %r752, %r753, %r754, %r755, %r756, %r757, %r758, %r759, %r760, %r761, %r762, %r763, %r764, %r765, %r766, %r767, %r768, %r769, %r770, %r771, %r772, %r773, %r774, %r775, %r776, %r777, %r778, %r779, %r780, %r781, %r782, %r783}, %r784, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x2.max.abs.NaN.f32 {%r785, %r786}, %r787, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x4.max.abs.NaN.f32 {%r788, %r789, %r790, %r791}, %r792, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x8.max.abs.NaN.f32 {%r793, %r794, %r795, %r796, %r797, %r798, %r799, %r800}, %r801, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x16.max.abs.NaN.f32 {%r802, %r803, %r804, %r805, %r806, %r807, %r808, %r809, %r810, %r811, %r812, %r813, %r814, %r815, %r816, %r817}, %r818, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x32.max.abs.NaN.f32 {%r819, %r820, %r821, %r822, %r823, %r824, %r825, %r826, %r827, %r828, %r829, %r830, %r831, %r832, %r833, %r834, %r835, %r836, %r837, %r838, %r839, %r840, %r841, %r842, %r843, %r844, %r845, %r846, %r847, %r848, %r849, %r850}, %r851, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x64.max.abs.NaN.f32 {%r852, %r853, %r854, %r855, %r856, %r857, %r858, %r859, %r860, %r861, %r862, %r863, %r864, %r865, %r866, %r867, %r868, %r869, %r870, %r871, %r872, %r873, %r874, %r875, %r876, %r877, %r878, %r879, %r880, %r881, %r882, %r883, %r884, %r885, %r886, %r887, %r888, %r889, %r890, %r891, %r892, %r893, %r894, %r895, %r896, %r897, %r898, %r899, %r900, %r901, %r902, %r903, %r904, %r905, %r906, %r907, %r908, %r909, %r910, %r911, %r912, %r913, %r914, %r915}, %r916, [%r1], 0;
+; CHECK-NEXT: tcgen05.ld.red.sync.aligned.16x32bx2.x128.max.abs.NaN.f32 {%r917, %r918, %r919, %r920, %r921, %r922, %r923, %r924, %r925, %r926, %r927, %r928, %r929, %r930, %r931, %r932, %r933, %r934, %r935, %r936, %r937, %r938, %r939, %r940, %r941, %r942, %r943, %r944, %r945, %r946, %r947, %r948, %r949, %r950, %r951, %r952, %r953, %r954, %r955, %r956, %r957, %r958, %r959, %r960, %r961, %r962, %r963, %r964, %r965, %r966, %r967, %r968, %r969, %r970, %r971, %r972, %r973, %r974, %r975, %r976, %r977, %r978, %r979, %r980, %r981, %r982, %r983, %r984, %r985, %r986, %r987, %r988, %r989, %r990, %r991, %r992, %r993, %r994, %r995, %r996, %r997, %r998, %r999, %r1000, %r1001, %r1002, %r1003, %r1004, %r1005, %r1006, %r1007, %r1008, %r1009, %r1010, %r1011, %r1012, %r1013, %r1014, %r1015, %r1016, %r1017, %r1018, %r1019, %r1020, %r1021, %r1022, %r1023, %r1024, %r1025, %r1026, %r1027, %r1028, %r1029, %r1030, %r1031, %r1032, %r1033, %r1034, %r1035, %r1036, %r1037, %r1038, %r1039, %r1040, %r1041, %r1042, %r1043, %r1044}, %r1045, [%r1], 0;
+; CHECK-NEXT: ret;
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 0)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 0, i1 1)
+
+ tail call {<2 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<4 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<8 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<16 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<32 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<64 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ tail call {<128 x float>, float} @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %taddr, i64 0, i32 1, i1 1, i1 1)
+ ret void
+}