[Mlir-commits] [mlir] [mlir][NVVM] Add support for tcgen05.ld.red Op (PR #177330)
Pradeep Kumar
llvmlistbot at llvm.org
Thu Jan 22 23:33:31 PST 2026
https://github.com/schwarzschild-radius updated https://github.com/llvm/llvm-project/pull/177330
>From 842777e9e2c7032814b1693599be60893ae6ca5e Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Thu, 22 Jan 2026 07:18:25 +0000
Subject: [PATCH 1/2] [mlir][NVVM] Add support for tcgen05.ld.red Op
This commit adds support for tcgen05.ld.red Op with tests under tcgen05-ld-red.mlir and negative tests under tcgen05-ld-red-invalid.mlir
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 105 ++++
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 82 +++
.../LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir | 15 +
.../Target/LLVMIR/nvvm/tcgen05-ld-red.mlir | 475 ++++++++++++++++++
4 files changed, 677 insertions(+)
create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 64a52acbb2278..8168ef050b9f5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -5322,6 +5322,111 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSMa<[100, 101]>]> {
}];
}
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld.red Op
+//===----------------------------------------------------------------------===//
+
+def Tcgen05LdRedMin: I32EnumAttrCase<"MIN", 0, "min">;
+def Tcgen05LdRedMax: I32EnumAttrCase<"MAX", 1, "max">;
+
+def Tcgen05LdRedOperation: I32EnumAttr<
+ "Tcgen05LdRedOperation",
+ "tcgen05.ld.red reduction operation",
+ [Tcgen05LdRedMin, Tcgen05LdRedMax]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05LdRedOperationAttr:
+ EnumAttr<NVVM_Dialect, Tcgen05LdRedOperation, "tcgen05_ld_red_op"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05LdRedOp : NVVM_Op<"tcgen05.ld.red", [NVVMRequiresSMa<[101]>]> {
+ let summary = "tensor memory load and reduce instructions";
+ let arguments = (ins
+ Tcgen05LdStShapeAttr:$shape,
+ Tcgen05LdRedOperationAttr:$op,
+ UnitAttr:$abs,
+ UnitAttr:$nan,
+ LLVM_PointerTensor:$addr,
+ Optional<I64>:$offset
+ );
+
+ let results = (outs VectorOfLengthAndType<[2, 4, 8, 16, 32, 64, 128],
+ [I32, F32]>:$data,
+ AnyTypeOf<[I32, F32]>:$redVal);
+
+ let assemblyFormat = [{
+ $addr (`,` $offset^)? attr-dict `:` type($data) `,` type($redVal)
+ }];
+
+ let description = [{
+ Instruction `tcgen05.ld.red` asynchronously loads data from the Tensor
+ Memory at the location specified by the 32-bit address operand `addr` into
+ the destination register `data`, collectively across all threads of the
+ warps. The operaiton also performs reduction operation specified by `op` on
+ the loaded data across columns in each lane and stored into `redVal`
+
+ The `shape` and the `num` attribute together determines the total
+ dimension of the data which is loaded from the Tensor Memory. The `shape`
+ attribute indicates the base dimension of data to be accessed as described
+ in the Data Movement Shape. The `num` attribute indicates the repeat
+ factor on the base dimension resulting in the total dimension of the data
+ that is accessed.
+
+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+ `16x32b`. The base address of the first access is specified by `addr`
+ and the base address of the second access is specified by
+ `addr + offset`, where `offset` is an immediate argument.
+
+ The following table describes the size of the vector for various combinations
+ of `num` and `shape` attributes:
+ ```
+ |=============================================|
+ | num/shape | 16x32bx2/32x32b |
+ |=============================================|
+ | x2 | 2 |
+ | x4 | 4 |
+ | x8 | 8 |
+ | x16 | 16 |
+ | x32 | 32 |
+ | x64 | 64 |
+ | x128 | 128 |
+ |=============================================|
+ ```
+
+ Example:
+ ```mlir
+ %data, %redval = nvvm.tcgen05.ld,red %addr, %offset {
+ shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
+ } : <2xi32>, i32
+
+ %data, %redval = nvvm.tcgen05.ld,red %addr {
+ shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>,
+ } : <2xf32>, f32
+ ```
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld)
+ }];
+
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::Tcgen05LdRedOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ llvm::Value *result = createIntrinsicCall(builder, id, args);
+ $data = builder.CreateExtractValue(result, {0});
+ $redVal = builder.CreateExtractValue(result, {1});
+ }];
+}
+
//===----------------------------------------------------------------------===//
// NVVM tcgen05.st Op
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 6ce80c7456d6a..3a97529fa54df 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -5494,6 +5494,88 @@ mlir::NVVM::IDArgPair Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs(
return {ID, args};
}
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld.red functions
+//===----------------------------------------------------------------------===//
+
+#define TCGEN05LDRED(SHAPE, NUM, TYPE) \
+ llvm::Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_##NUM##_##TYPE
+
+mlir::NVVM::IDArgPair NVVM::Tcgen05LdRedOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::Tcgen05LdRedOp>(op);
+ llvm::SmallVector<llvm::Value *> args;
+
+ mlir::VectorType VecResTy =
+ cast<mlir::VectorType>(thisOp.getData().getType());
+ unsigned Num = VecResTy.getNumElements();
+ bool IsFloat = thisOp.getRedVal().getType().isF32();
+
+ llvm::Intrinsic::ID Shape32x32b[][2] = {
+ {notIntrinsic, notIntrinsic},
+ {TCGEN05LDRED(32x32b, x2, i32), TCGEN05LDRED(32x32b, x2, f32)},
+ {TCGEN05LDRED(32x32b, x4, i32), TCGEN05LDRED(32x32b, x4, f32)},
+ {TCGEN05LDRED(32x32b, x8, i32), TCGEN05LDRED(32x32b, x8, f32)},
+ {TCGEN05LDRED(32x32b, x16, i32), TCGEN05LDRED(32x32b, x16, f32)},
+ {TCGEN05LDRED(32x32b, x32, i32), TCGEN05LDRED(32x32b, x32, f32)},
+ {TCGEN05LDRED(32x32b, x64, i32), TCGEN05LDRED(32x32b, x64, f32)},
+ {TCGEN05LDRED(32x32b, x128, i32), TCGEN05LDRED(32x32b, x128, f32)},
+ };
+
+ llvm::Intrinsic::ID Shape16x32bx2[][2] = {
+ {notIntrinsic, notIntrinsic},
+ {TCGEN05LDRED(16x32bx2, x2, i32), TCGEN05LDRED(16x32bx2, x2, f32)},
+ {TCGEN05LDRED(16x32bx2, x4, i32), TCGEN05LDRED(16x32bx2, x4, f32)},
+ {TCGEN05LDRED(16x32bx2, x8, i32), TCGEN05LDRED(16x32bx2, x8, f32)},
+ {TCGEN05LDRED(16x32bx2, x16, i32), TCGEN05LDRED(16x32bx2, x16, f32)},
+ {TCGEN05LDRED(16x32bx2, x32, i32), TCGEN05LDRED(16x32bx2, x32, f32)},
+ {TCGEN05LDRED(16x32bx2, x64, i32), TCGEN05LDRED(16x32bx2, x64, f32)},
+ {TCGEN05LDRED(16x32bx2, x128, i32), TCGEN05LDRED(16x32bx2, x128, f32)},
+ };
+
+ NVVM::Tcgen05LdStShape shape = thisOp.getShape();
+ unsigned ID = [&]() {
+ // `num` contains the length of vector and log2 of `num` returns the index
+ // into the shape array
+ unsigned idx = std::log2(Num);
+ switch (shape) {
+ case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+ return Shape32x32b[idx][IsFloat];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+ return Shape16x32bx2[idx][IsFloat];
+ default:
+ llvm_unreachable("unhandled tcgen05.ld lowering");
+ }
+ }();
+
+ args.push_back(mt.lookupValue(thisOp.getAddr()));
+
+ if (shape == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2)
+ args.push_back(mt.lookupValue(thisOp.getOffset()));
+
+ args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getOp())));
+
+ if (IsFloat) {
+ args.push_back(builder.getInt1(static_cast<unsigned>(thisOp.getAbs())));
+ args.push_back(builder.getInt1(static_cast<unsigned>(thisOp.getNan())));
+ }
+ return {ID, args};
+}
+
+LogicalResult Tcgen05LdRedOp::verify() {
+ VectorType data = cast<VectorType>(getData().getType());
+ Type redVal = getRedVal().getType();
+
+ if (data.getElementType() != redVal)
+ return emitError(
+ "type of reduction value and element type of vector data should match");
+
+ if (redVal.isInteger() && (getAbs() || getNan())) {
+ return emitError("abs or nan is only applicable for f32 type");
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// NVVMDialect initialization, type parsing, and registration.
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
new file mode 100644
index 0000000000000..9c77eca62b79b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-translate --mlir-to-llvmir -verify-diagnostics -split-input-file %s
+
+llvm.func @tcgen05_ld_red_same_types(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{type of reduction value and element type of vector data should match}}
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, f32
+ llvm.return
+}
+
+// -----
+
+llvm.func @tcgen05_ld_red_i32_abs_nan(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{abs or nan is only applicable for f32 type}}
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<2 x i32>, i32
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
new file mode 100644
index 0000000000000..84c54caf043d6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
@@ -0,0 +1,475 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_min
+llvm.func @nvvm_tcgen05_ld_32x32b_min(%addr : !llvm.ptr<6>) {
+
+ // CHECK: {{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 0
+ // CHECK: %{{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 1
+ // CHECK: %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x i32>, i32
+
+ %data3, %redval3 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x i32>, i32
+
+ %data4, %redval4 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x i32>, i32
+
+ %data5, %redval5 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x i32>, i32
+
+ %data6, %redval6 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x i32>, i32
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_max
+llvm.func @nvvm_tcgen05_ld_32x32b_max(%addr : !llvm.ptr<6>) {
+
+ // CHECK: {{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 0
+ // CHECK: %{{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 1
+ // CHECK: %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x i32>, i32
+
+ %data3, %redval3 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x i32>, i32
+
+ %data4, %redval4 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x i32>, i32
+
+ %data5, %redval5 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x i32>, i32
+
+ %data6, %redval6 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x i32>, i32
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_min_abs_nan
+llvm.func @nvvm_tcgen05_ld_32x32b_min_abs_nan(%addr : !llvm.ptr<6>) {
+
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_max_abs_nan
+llvm.func @nvvm_tcgen05_ld_32x32b_max_abs_nan(%addr : !llvm.ptr<6>) {
+
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 true, i1 true)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_min
+llvm.func @nvvm_tcgen05_ld_16x32bx2_min(%addr : !llvm.ptr<6>) {
+
+ %offset = llvm.mlir.constant(0: i64) : i64
+
+ // CHECK %{{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <2 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <4 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <8 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 0)
+ // CHECK %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x i32>, i32
+
+ %data3, %redval3 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x i32>, i32
+
+ %data4, %redval4 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x i32>, i32
+
+ %data5, %redval5 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x i32>, i32
+
+ %data6, %redval6 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x i32>, i32
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_max
+llvm.func @nvvm_tcgen05_ld_16x32bx2_max(%addr : !llvm.ptr<6>) {
+
+ %offset = llvm.mlir.constant(0: i64) : i64
+
+ // CHECK %{{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <2 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <4 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <8 x i32>, i32 } {{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x i32>, i32 } {{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.i32(ptr addrspace(6) %{{.*}}, i64 0, i32 1)
+ // CHECK %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 false, i1 false)
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x i32>, i32
+
+ %data3, %redval3 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x i32>, i32
+
+ %data4, %redval4 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x i32>, i32
+
+ %data5, %redval5 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x i32>, i32
+
+ %data6, %redval6 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x i32>, i32
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_min_nan_abs
+llvm.func @nvvm_tcgen05_ld_16x32bx2_min_nan_abs(%addr : !llvm.ptr<6>) {
+
+ %offset = llvm.mlir.constant(0: i64) : i64
+ // CHECK %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 0, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_max_nan_abs
+llvm.func @nvvm_tcgen05_ld_16x32bx2_max_nan_abs(%addr : !llvm.ptr<6>) {
+
+ %offset = llvm.mlir.constant(0: i64) : i64
+ // CHECK %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x2.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x4.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x8.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x16.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x32.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x64.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.16x32bx2.x128.f32(ptr addrspace(6) %{{.*}}, i64 0, i32 1, i1 true, i1 true)
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<128 x f32>, f32
+ llvm.return
+}
>From add88faadb5a2b3bafecc9aa5b18b8463e500916 Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Thu, 22 Jan 2026 11:51:02 +0000
Subject: [PATCH 2/2] Rename ReduxKind to ReductionKind
The commit contains the following changes:
- Rename ReduxKind to ReductionKind and across other files
- Replace Tcgen05LdRedOperation with ReductionKind and update .cpp and test files
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 75 ++++----
.../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 23 +--
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 33 ++--
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 24 +--
.../LLVMIR/nvvm/redux-sync-invalid.mlir | 6 +-
.../LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir | 29 ++-
.../Target/LLVMIR/nvvm/tcgen05-ld-red.mlir | 168 +++++++++---------
mlir/test/python/dialects/nvvm.py | 18 +-
8 files changed, 197 insertions(+), 179 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8168ef050b9f5..37f11150ddb4a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -232,6 +232,30 @@ def MemOrderKindAttr : EnumAttr<NVVM_Dialect, MemOrderKind, "mem_order"> {
let assemblyFormat = "`<` $value `>`";
}
+// Attrs for supported Reduction operations
+def ReductionKindNone : I32EnumAttrCase<"NONE", 0, "none">;
+def ReductionKindAdd : I32EnumAttrCase<"ADD", 1, "add">;
+def ReductionKindAnd : I32EnumAttrCase<"AND", 2, "and">;
+def ReductionKindMax : I32EnumAttrCase<"MAX", 3, "max">;
+def ReductionKindMin : I32EnumAttrCase<"MIN", 4, "min">;
+def ReductionKindOr : I32EnumAttrCase<"OR", 5, "or">;
+def ReductionKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">;
+def ReductionKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">;
+def ReductionKindXor : I32EnumAttrCase<"XOR", 8, "xor">;
+def ReductionKindFmin : I32EnumAttrCase<"FMIN", 9, "fmin">;
+def ReductionKindFmax : I32EnumAttrCase<"FMAX", 10, "fmax">;
+
+/// Enum attribute of the different kinds.
+def ReductionKind : I32EnumAttr<"ReductionKind", "NVVM Reduction Kind attribute",
+ [ReductionKindAdd, ReductionKindAnd, ReductionKindMax, ReductionKindMin,
+ ReductionKindOr, ReductionKindUmax, ReductionKindUmin, ReductionKindXor,
+ ReductionKindFmin, ReductionKindFmax]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+
+def ReductionKindAttr : EnumAttr<NVVM_Dialect, ReductionKind, "reduction_kind">;
+
//===----------------------------------------------------------------------===//
// NVVM intrinsic operations
//===----------------------------------------------------------------------===//
@@ -493,33 +517,11 @@ def NVVM_RcpApproxFtzF32Op : NVVM_IntrOp<"rcp.approx.ftz.f", [Pure], 1> {
// NVVM redux op definitions
//===----------------------------------------------------------------------===//
-def ReduxKindNone : I32EnumAttrCase<"NONE", 0, "none">;
-def ReduxKindAdd : I32EnumAttrCase<"ADD", 1, "add">;
-def ReduxKindAnd : I32EnumAttrCase<"AND", 2, "and">;
-def ReduxKindMax : I32EnumAttrCase<"MAX", 3, "max">;
-def ReduxKindMin : I32EnumAttrCase<"MIN", 4, "min">;
-def ReduxKindOr : I32EnumAttrCase<"OR", 5, "or">;
-def ReduxKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">;
-def ReduxKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">;
-def ReduxKindXor : I32EnumAttrCase<"XOR", 8, "xor">;
-def ReduxKindFmin : I32EnumAttrCase<"FMIN", 9, "fmin">;
-def ReduxKindFmax : I32EnumAttrCase<"FMAX", 10, "fmax">;
-
-/// Enum attribute of the different kinds.
-def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind",
- [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr,
- ReduxKindUmax, ReduxKindUmin, ReduxKindXor, ReduxKindFmin, ReduxKindFmax]> {
- let genSpecializedAttr = 0;
- let cppNamespace = "::mlir::NVVM";
-}
-
-def ReduxKindAttr : EnumAttr<NVVM_Dialect, ReduxKind, "redux_kind">;
-
def NVVM_ReduxOp :
NVVM_Op<"redux.sync", [NVVMRequiresSM<80>, AllTypesMatch<["res", "val"]>]>,
Results<(outs AnyTypeOf<[I32, F32]>:$res)>,
Arguments<(ins AnyTypeOf<[I32, F32]>:$val,
- ReduxKindAttr:$kind,
+ ReductionKindAttr:$kind,
I32:$mask_and_clamp,
DefaultValuedAttr<BoolAttr, "false">:$abs,
DefaultValuedAttr<BoolAttr, "false">:$nan)> {
@@ -5326,27 +5328,12 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSMa<[100, 101]>]> {
// NVVM tcgen05.ld.red Op
//===----------------------------------------------------------------------===//
-def Tcgen05LdRedMin: I32EnumAttrCase<"MIN", 0, "min">;
-def Tcgen05LdRedMax: I32EnumAttrCase<"MAX", 1, "max">;
-
-def Tcgen05LdRedOperation: I32EnumAttr<
- "Tcgen05LdRedOperation",
- "tcgen05.ld.red reduction operation",
- [Tcgen05LdRedMin, Tcgen05LdRedMax]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05LdRedOperationAttr:
- EnumAttr<NVVM_Dialect, Tcgen05LdRedOperation, "tcgen05_ld_red_op"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05LdRedOp : NVVM_Op<"tcgen05.ld.red", [NVVMRequiresSMa<[101]>]> {
- let summary = "tensor memory load and reduce instructions";
+def NVVM_Tcgen05LdRedOp : NVVM_Op<"tcgen05.ld.red",
+ [NVVMRequiresSMa<[101, 110]>]> {
+ let summary = "Tcgen05 tensor memory load and reduce instructions";
let arguments = (ins
Tcgen05LdStShapeAttr:$shape,
- Tcgen05LdRedOperationAttr:$op,
+ ReductionKindAttr:$op,
UnitAttr:$abs,
UnitAttr:$nan,
LLVM_PointerTensor:$addr,
@@ -5358,14 +5345,14 @@ def NVVM_Tcgen05LdRedOp : NVVM_Op<"tcgen05.ld.red", [NVVMRequiresSMa<[101]>]> {
AnyTypeOf<[I32, F32]>:$redVal);
let assemblyFormat = [{
- $addr (`,` $offset^)? attr-dict `:` type($data) `,` type($redVal)
+ $op $addr (`,` $offset^)? attr-dict `:` type($data) `,` type($redVal)
}];
let description = [{
Instruction `tcgen05.ld.red` asynchronously loads data from the Tensor
Memory at the location specified by the 32-bit address operand `addr` into
the destination register `data`, collectively across all threads of the
- warps. The operaiton also performs reduction operation specified by `op` on
+ warp. The operation also performs reduction operation specified by `op` on
the loaded data across columns in each lane and stored into `redVal`
The `shape` and the `num` attribute together determines the total
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 6394296e99b9e..5fdfc9fa8cdb6 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -61,31 +61,31 @@ static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
llvm_unreachable("unknown shuffle mode");
}
-static std::optional<NVVM::ReduxKind>
-convertReduxKind(gpu::AllReduceOperation mode) {
+static std::optional<NVVM::ReductionKind>
+convertToNVVMReductionKind(gpu::AllReduceOperation mode) {
switch (mode) {
case gpu::AllReduceOperation::ADD:
- return NVVM::ReduxKind::ADD;
+ return NVVM::ReductionKind::ADD;
case gpu::AllReduceOperation::MUL:
return std::nullopt;
case gpu::AllReduceOperation::MINSI:
- return NVVM::ReduxKind::MIN;
+ return NVVM::ReductionKind::MIN;
case gpu::AllReduceOperation::MINUI:
return std::nullopt;
case gpu::AllReduceOperation::MINNUMF:
- return NVVM::ReduxKind::MIN;
+ return NVVM::ReductionKind::MIN;
case gpu::AllReduceOperation::MAXSI:
- return NVVM::ReduxKind::MAX;
+ return NVVM::ReductionKind::MAX;
case gpu::AllReduceOperation::MAXUI:
return std::nullopt;
case gpu::AllReduceOperation::MAXNUMF:
- return NVVM::ReduxKind::MAX;
+ return NVVM::ReductionKind::MAX;
case gpu::AllReduceOperation::AND:
- return NVVM::ReduxKind::AND;
+ return NVVM::ReductionKind::AND;
case gpu::AllReduceOperation::OR:
- return NVVM::ReduxKind::OR;
+ return NVVM::ReductionKind::OR;
case gpu::AllReduceOperation::XOR:
- return NVVM::ReduxKind::XOR;
+ return NVVM::ReductionKind::XOR;
case gpu::AllReduceOperation::MINIMUMF:
case gpu::AllReduceOperation::MAXIMUMF:
return std::nullopt;
@@ -113,7 +113,8 @@ struct GPUSubgroupReduceOpLowering
if (!op.getValue().getType().isInteger(32))
return rewriter.notifyMatchFailure(op, "unsupported data type");
- std::optional<NVVM::ReduxKind> mode = convertReduxKind(op.getOp());
+ std::optional<NVVM::ReductionKind> mode =
+ convertToNVVMReductionKind(op.getOp());
if (!mode.has_value())
return rewriter.notifyMatchFailure(
op, "unsupported reduction mode for redux");
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 3a97529fa54df..2d55689ef89e6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -2979,26 +2979,26 @@ LogicalResult NVVM::ReduxOp::verify() {
return emitOpError("nan attribute is supported only for f32 type");
}
- NVVM::ReduxKind kind = getKind();
+ NVVM::ReductionKind kind = getKind();
switch (kind) {
- case NVVM::ReduxKind::ADD:
- case NVVM::ReduxKind::AND:
- case NVVM::ReduxKind::OR:
- case NVVM::ReduxKind::XOR:
- case NVVM::ReduxKind::MAX:
- case NVVM::ReduxKind::MIN:
- case NVVM::ReduxKind::UMAX:
- case NVVM::ReduxKind::UMIN:
+ case NVVM::ReductionKind::ADD:
+ case NVVM::ReductionKind::AND:
+ case NVVM::ReductionKind::OR:
+ case NVVM::ReductionKind::XOR:
+ case NVVM::ReductionKind::MAX:
+ case NVVM::ReductionKind::MIN:
+ case NVVM::ReductionKind::UMAX:
+ case NVVM::ReductionKind::UMIN:
if (!reduxType.isInteger(32))
return emitOpError("'")
- << stringifyEnum(kind) << "' redux kind unsupported with "
+ << stringifyEnum(kind) << "' reduction kind unsupported with "
<< reduxType << " type. Only supported type is 'i32'.";
break;
- case NVVM::ReduxKind::FMIN:
- case NVVM::ReduxKind::FMAX:
+ case NVVM::ReductionKind::FMIN:
+ case NVVM::ReductionKind::FMAX:
if (!reduxType.isF32())
return emitOpError("'")
- << stringifyEnum(kind) << "' redux kind unsupported with "
+ << stringifyEnum(kind) << "' reduction kind unsupported with "
<< reduxType << " type. Only supported type is 'f32'.";
break;
}
@@ -5553,7 +5553,8 @@ mlir::NVVM::IDArgPair NVVM::Tcgen05LdRedOp::getIntrinsicIDAndArgs(
if (shape == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2)
args.push_back(mt.lookupValue(thisOp.getOffset()));
- args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getOp())));
+ args.push_back(
+ builder.getInt32(thisOp.getOp() == NVVM::ReductionKind::MIN ? 0 : 1));
if (IsFloat) {
args.push_back(builder.getInt1(static_cast<unsigned>(thisOp.getAbs())));
@@ -5570,6 +5571,10 @@ LogicalResult Tcgen05LdRedOp::verify() {
return emitError(
"type of reduction value and element type of vector data should match");
+ if (getOp() != NVVM::ReductionKind::MIN &&
+ getOp() != NVVM::ReductionKind::MAX)
+ return emitError("only min and max reduction operations are supported");
+
if (redVal.isInteger() && (getAbs() || getNan())) {
return emitError("abs or nan is only applicable for f32 type");
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index b7427a559fb79..dd7a6e76f7569 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -34,31 +34,31 @@ using mlir::LLVM::detail::createIntrinsicCall;
hasAbs ? REDUX_F32_ID_IMPL(op, _abs, hasNaN) : REDUX_F32_ID_IMPL(op, , hasNaN)
static llvm::Intrinsic::ID getReduxIntrinsicId(llvm::Type *resultType,
- NVVM::ReduxKind kind,
+ NVVM::ReductionKind kind,
bool hasAbs, bool hasNaN) {
switch (kind) {
- case NVVM::ReduxKind::ADD:
+ case NVVM::ReductionKind::ADD:
return llvm::Intrinsic::nvvm_redux_sync_add;
- case NVVM::ReduxKind::UMAX:
+ case NVVM::ReductionKind::UMAX:
return llvm::Intrinsic::nvvm_redux_sync_umax;
- case NVVM::ReduxKind::UMIN:
+ case NVVM::ReductionKind::UMIN:
return llvm::Intrinsic::nvvm_redux_sync_umin;
- case NVVM::ReduxKind::AND:
+ case NVVM::ReductionKind::AND:
return llvm::Intrinsic::nvvm_redux_sync_and;
- case NVVM::ReduxKind::OR:
+ case NVVM::ReductionKind::OR:
return llvm::Intrinsic::nvvm_redux_sync_or;
- case NVVM::ReduxKind::XOR:
+ case NVVM::ReductionKind::XOR:
return llvm::Intrinsic::nvvm_redux_sync_xor;
- case NVVM::ReduxKind::MAX:
+ case NVVM::ReductionKind::MAX:
return llvm::Intrinsic::nvvm_redux_sync_max;
- case NVVM::ReduxKind::MIN:
+ case NVVM::ReductionKind::MIN:
return llvm::Intrinsic::nvvm_redux_sync_min;
- case NVVM::ReduxKind::FMIN:
+ case NVVM::ReductionKind::FMIN:
return GET_REDUX_F32_ID(min, hasAbs, hasNaN);
- case NVVM::ReduxKind::FMAX:
+ case NVVM::ReductionKind::FMAX:
return GET_REDUX_F32_ID(max, hasAbs, hasNaN);
}
- llvm_unreachable("unknown redux kind");
+ llvm_unreachable("unknown reduction kind");
}
static llvm::Intrinsic::ID getShflIntrinsicId(llvm::Type *resultType,
diff --git a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
index a8a743006fbf8..f91f68852fb7e 100644
--- a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
@@ -19,7 +19,7 @@ llvm.func @redux_sync_i32_with_nan(%value: i32, %offset: i32) {
// -----
llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) {
- // expected-error at +1 {{'add' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+ // expected-error at +1 {{'add' reduction kind unsupported with 'f32' type. Only supported type is 'i32'.}}
%res = nvvm.redux.sync add %value, %offset: f32 -> f32
llvm.return
}
@@ -27,7 +27,7 @@ llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) {
// -----
llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) {
- // expected-error at +1 {{'and' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+ // expected-error at +1 {{'and' reduction kind unsupported with 'f32' type. Only supported type is 'i32'.}}
%res = nvvm.redux.sync and %value, %offset: f32 -> f32
llvm.return
}
@@ -35,7 +35,7 @@ llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) {
// -----
llvm.func @redux_sync_i32_with_invalid_kind_fmin(%value: i32, %offset: i32) {
- // expected-error at +1 {{'fmin' redux kind unsupported with 'i32' type. Only supported type is 'f32'.}}
+ // expected-error at +1 {{'fmin' reduction kind unsupported with 'i32' type. Only supported type is 'f32'.}}
%res = nvvm.redux.sync fmin %value, %offset: i32 -> i32
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
index 9c77eca62b79b..15840f5e96ed9 100644
--- a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
@@ -1,8 +1,33 @@
// RUN: mlir-translate --mlir-to-llvmir -verify-diagnostics -split-input-file %s
+llvm.func @tcgen05_ld_red_add(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{only min and max reduction kinds are supported}}
+ %data, %redval = nvvm.tcgen05.ld.red add %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>, i32
+ llvm.return
+}
+
+// -----
+
llvm.func @tcgen05_ld_red_same_types(%addr : !llvm.ptr<6>) {
// expected-error @below {{type of reduction value and element type of vector data should match}}
- %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, f32
+ %data, %redval = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>, f32
+ llvm.return
+}
+
+// -----
+
+llvm.func @tcgen05_ld_red_i32_abs_nan(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{abs or nan is only applicable for f32 type}}
+ %data, %redval = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, nan, abs} : vector<2 x i32>, i32
+ llvm.return
+}
+
+
+// -----
+
+llvm.func @tcgen05_ld_red_i32_abs_nan(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{abs or nan is only applicable for f32 type}}
+ %data, %redval = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs} : vector<2 x i32>, i32
llvm.return
}
@@ -10,6 +35,6 @@ llvm.func @tcgen05_ld_red_same_types(%addr : !llvm.ptr<6>) {
llvm.func @tcgen05_ld_red_i32_abs_nan(%addr : !llvm.ptr<6>) {
// expected-error @below {{abs or nan is only applicable for f32 type}}
- %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<2 x i32>, i32
+ %data, %redval = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, nan} : vector<2 x i32>, i32
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
index 84c54caf043d6..492c22ecf2fd3 100644
--- a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
@@ -46,33 +46,33 @@ llvm.func @nvvm_tcgen05_ld_32x32b_min(%addr : !llvm.ptr<6>) {
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, i32
+ %data, %redval = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>, i32
- %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x i32>, i32
+ %data1, %redval1 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>, i32
- %data2, %redval2 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x i32>, i32
+ %data2, %redval2 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>, i32
- %data3, %redval3 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x i32>, i32
+ %data3, %redval3 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>, i32
- %data4, %redval4 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x i32>, i32
+ %data4, %redval4 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>, i32
- %data5, %redval5 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x i32>, i32
+ %data5, %redval5 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>, i32
- %data6, %redval6 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x i32>, i32
+ %data6, %redval6 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>, i32
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x f32>, f32
llvm.return
}
@@ -122,33 +122,33 @@ llvm.func @nvvm_tcgen05_ld_32x32b_max(%addr : !llvm.ptr<6>) {
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x i32>, i32
+ %data, %redval = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>, i32
- %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x i32>, i32
+ %data1, %redval1 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>, i32
- %data2, %redval2 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x i32>, i32
+ %data2, %redval2 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>, i32
- %data3, %redval3 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x i32>, i32
+ %data3, %redval3 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>, i32
- %data4, %redval4 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x i32>, i32
+ %data4, %redval4 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>, i32
- %data5, %redval5 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x i32>, i32
+ %data5, %redval5 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>, i32
- %data6, %redval6 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x i32>, i32
+ %data6, %redval6 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>, i32
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x f32>, f32
llvm.return
}
@@ -177,19 +177,19 @@ llvm.func @nvvm_tcgen05_ld_32x32b_min_abs_nan(%addr : !llvm.ptr<6>) {
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, abs, nan} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red min %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<128 x f32>, f32
llvm.return
}
@@ -218,19 +218,19 @@ llvm.func @nvvm_tcgen05_ld_32x32b_max_abs_nan(%addr : !llvm.ptr<6>) {
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>, abs, nan} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red max %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, abs, nan} : vector<128 x f32>, f32
llvm.return
}
@@ -282,33 +282,33 @@ llvm.func @nvvm_tcgen05_ld_16x32bx2_min(%addr : !llvm.ptr<6>) {
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data, %redval = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, i32
+ %data, %redval = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>, i32
- %data1, %redval1 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x i32>, i32
+ %data1, %redval1 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>, i32
- %data2, %redval2 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x i32>, i32
+ %data2, %redval2 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>, i32
- %data3, %redval3 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x i32>, i32
+ %data3, %redval3 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>, i32
- %data4, %redval4 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x i32>, i32
+ %data4, %redval4 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>, i32
- %data5, %redval5 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x i32>, i32
+ %data5, %redval5 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>, i32
- %data6, %redval6 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x i32>, i32
+ %data6, %redval6 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>, i32
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x f32>, f32
llvm.return
}
@@ -360,33 +360,33 @@ llvm.func @nvvm_tcgen05_ld_16x32bx2_max(%addr : !llvm.ptr<6>) {
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data, %redval = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x i32>, i32
+ %data, %redval = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>, i32
- %data1, %redval1 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x i32>, i32
+ %data1, %redval1 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>, i32
- %data2, %redval2 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x i32>, i32
+ %data2, %redval2 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>, i32
- %data3, %redval3 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x i32>, i32
+ %data3, %redval3 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>, i32
- %data4, %redval4 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x i32>, i32
+ %data4, %redval4 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>, i32
- %data5, %redval5 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x i32>, i32
+ %data5, %redval5 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>, i32
- %data6, %redval6 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x i32>, i32
+ %data6, %redval6 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>, i32
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x f32>, f32
llvm.return
}
@@ -416,19 +416,19 @@ llvm.func @nvvm_tcgen05_ld_16x32bx2_min_nan_abs(%addr : !llvm.ptr<6>) {
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red min %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<128 x f32>, f32
llvm.return
}
@@ -458,18 +458,18 @@ llvm.func @nvvm_tcgen05_ld_16x32bx2_max_nan_abs(%addr : !llvm.ptr<6>) {
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
// CHECK %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
- %data7, %redval7 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<2 x f32>, f32
+ %data7, %redval7 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<2 x f32>, f32
- %data8, %redval8 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<4 x f32>, f32
+ %data8, %redval8 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<4 x f32>, f32
- %data9, %redval9 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<8 x f32>, f32
+ %data9, %redval9 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<8 x f32>, f32
- %data10, %redval10 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<16 x f32>, f32
+ %data10, %redval10 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<16 x f32>, f32
- %data11, %redval11 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<32 x f32>, f32
+ %data11, %redval11 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<32 x f32>, f32
- %data12, %redval12 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<64 x f32>, f32
+ %data12, %redval12 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<64 x f32>, f32
- %data13, %redval13 = nvvm.tcgen05.ld.red %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, op = #nvvm.tcgen05_ld_red_op<max>, nan, abs} : vector<128 x f32>, f32
+ %data13, %redval13 = nvvm.tcgen05.ld.red max %addr, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, nan, abs} : vector<128 x f32>, f32
llvm.return
}
diff --git a/mlir/test/python/dialects/nvvm.py b/mlir/test/python/dialects/nvvm.py
index d795524222fd2..62236c31e5fdc 100644
--- a/mlir/test/python/dialects/nvvm.py
+++ b/mlir/test/python/dialects/nvvm.py
@@ -164,19 +164,19 @@ def reductions(mask, vi32, vf32):
for abs in (True, False):
for nan in (True, False):
for kind in (
- nvvm.ReduxKind.AND,
- nvvm.ReduxKind.MAX,
- nvvm.ReduxKind.MIN,
- nvvm.ReduxKind.OR,
- nvvm.ReduxKind.UMAX,
- nvvm.ReduxKind.UMIN,
- nvvm.ReduxKind.XOR,
+ nvvm.ReductionKind.AND,
+ nvvm.ReductionKind.MAX,
+ nvvm.ReductionKind.MIN,
+ nvvm.ReductionKind.OR,
+ nvvm.ReductionKind.UMAX,
+ nvvm.ReductionKind.UMIN,
+ nvvm.ReductionKind.XOR,
):
nvvm.redux_sync(i32, vi32, kind, vi32)
for kind in (
- nvvm.ReduxKind.FMIN,
- nvvm.ReduxKind.FMAX,
+ nvvm.ReductionKind.FMIN,
+ nvvm.ReductionKind.FMAX,
):
nvvm.redux_sync(f32, vf32, kind, vi32, abs=abs, nan=nan)
More information about the Mlir-commits
mailing list