[Mlir-commits] [mlir] [mlir][NVVM] Add support for tcgen05.ld.red Op (PR #177330)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Jan 22 02:03:53 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-llvm
Author: Pradeep Kumar (schwarzschild-radius)
<details>
<summary>Changes</summary>
This commit adds support for tcgen05.ld.red Op with tests under tcgen05-ld-red.mlir and negative tests under tcgen05-ld-red-invalid.mlir
---
Patch is 47.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/177330.diff
4 Files Affected:
- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+105)
- (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+82)
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir (+15)
- (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir (+475)
``````````diff
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 64a52acbb2278..8168ef050b9f5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -5322,6 +5322,111 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSMa<[100, 101]>]> {
}];
}
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld.red Op
+//===----------------------------------------------------------------------===//
+
+def Tcgen05LdRedMin: I32EnumAttrCase<"MIN", 0, "min">;
+def Tcgen05LdRedMax: I32EnumAttrCase<"MAX", 1, "max">;
+
+def Tcgen05LdRedOperation: I32EnumAttr<
+ "Tcgen05LdRedOperation",
+ "tcgen05.ld.red reduction operation",
+ [Tcgen05LdRedMin, Tcgen05LdRedMax]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05LdRedOperationAttr:
+ EnumAttr<NVVM_Dialect, Tcgen05LdRedOperation, "tcgen05_ld_red_op"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05LdRedOp : NVVM_Op<"tcgen05.ld.red", [NVVMRequiresSMa<[101]>]> {
+ let summary = "tensor memory load and reduce instructions";
+ let arguments = (ins
+ Tcgen05LdStShapeAttr:$shape,
+ Tcgen05LdRedOperationAttr:$op,
+ UnitAttr:$abs,
+ UnitAttr:$nan,
+ LLVM_PointerTensor:$addr,
+ Optional<I64>:$offset
+ );
+
+ let results = (outs VectorOfLengthAndType<[2, 4, 8, 16, 32, 64, 128],
+ [I32, F32]>:$data,
+ AnyTypeOf<[I32, F32]>:$redVal);
+
+ let assemblyFormat = [{
+ $addr (`,` $offset^)? attr-dict `:` type($data) `,` type($redVal)
+ }];
+
+ let description = [{
+ Instruction `tcgen05.ld.red` asynchronously loads data from the Tensor
+ Memory at the location specified by the 32-bit address operand `addr` into
+ the destination register `data`, collectively across all threads of the
+ warps. The operaiton also performs reduction operation specified by `op` on
+ the loaded data across columns in each lane and stored into `redVal`
+
+ The `shape` and the `num` attribute together determines the total
+ dimension of the data which is loaded from the Tensor Memory. The `shape`
+ attribute indicates the base dimension of data to be accessed as described
+ in the Data Movement Shape. The `num` attribute indicates the repeat
+ factor on the base dimension resulting in the total dimension of the data
+ that is accessed.
+
+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+ `16x32b`. The base address of the first access is specified by `addr`
+ and the base address of the second access is specified by
+ `addr + offset`, where `offset` is an immediate argument.
+
+ The following table describes the size of the vector for various combinations
+ of `num` and `shape` attributes:
+ ```
+ |=============================================|
+ | num/shape | 16x32bx2/32x32b |
+ |=============================================|
+ | x2 | 2 |
+ | x4 | 4 |
+ | x8 | 8 |
+ | x16 | 16 |
+ | x32 | 32 |
+ | x64 | 64 |
+ | x128 | 128 |
+ |=============================================|
+ ```
+
+ Example:
+ ```mlir
+ %data, %redval = nvvm.tcgen05.ld,red %addr, %offset {
+ shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>,
+ } : <2xi32>, i32
+
+ %data, %redval = nvvm.tcgen05.ld,red %addr {
+ shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>,
+ } : <2xf32>, f32
+ ```
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld)
+ }];
+
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::Tcgen05LdRedOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ llvm::Value *result = createIntrinsicCall(builder, id, args);
+ $data = builder.CreateExtractValue(result, {0});
+ $redVal = builder.CreateExtractValue(result, {1});
+ }];
+}
+
//===----------------------------------------------------------------------===//
// NVVM tcgen05.st Op
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 6ce80c7456d6a..3a97529fa54df 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -5494,6 +5494,88 @@ mlir::NVVM::IDArgPair Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs(
return {ID, args};
}
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld.red functions
+//===----------------------------------------------------------------------===//
+
+#define TCGEN05LDRED(SHAPE, NUM, TYPE) \
+ llvm::Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_##NUM##_##TYPE
+
+mlir::NVVM::IDArgPair NVVM::Tcgen05LdRedOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::Tcgen05LdRedOp>(op);
+ llvm::SmallVector<llvm::Value *> args;
+
+ mlir::VectorType VecResTy =
+ cast<mlir::VectorType>(thisOp.getData().getType());
+ unsigned Num = VecResTy.getNumElements();
+ bool IsFloat = thisOp.getRedVal().getType().isF32();
+
+ llvm::Intrinsic::ID Shape32x32b[][2] = {
+ {notIntrinsic, notIntrinsic},
+ {TCGEN05LDRED(32x32b, x2, i32), TCGEN05LDRED(32x32b, x2, f32)},
+ {TCGEN05LDRED(32x32b, x4, i32), TCGEN05LDRED(32x32b, x4, f32)},
+ {TCGEN05LDRED(32x32b, x8, i32), TCGEN05LDRED(32x32b, x8, f32)},
+ {TCGEN05LDRED(32x32b, x16, i32), TCGEN05LDRED(32x32b, x16, f32)},
+ {TCGEN05LDRED(32x32b, x32, i32), TCGEN05LDRED(32x32b, x32, f32)},
+ {TCGEN05LDRED(32x32b, x64, i32), TCGEN05LDRED(32x32b, x64, f32)},
+ {TCGEN05LDRED(32x32b, x128, i32), TCGEN05LDRED(32x32b, x128, f32)},
+ };
+
+ llvm::Intrinsic::ID Shape16x32bx2[][2] = {
+ {notIntrinsic, notIntrinsic},
+ {TCGEN05LDRED(16x32bx2, x2, i32), TCGEN05LDRED(16x32bx2, x2, f32)},
+ {TCGEN05LDRED(16x32bx2, x4, i32), TCGEN05LDRED(16x32bx2, x4, f32)},
+ {TCGEN05LDRED(16x32bx2, x8, i32), TCGEN05LDRED(16x32bx2, x8, f32)},
+ {TCGEN05LDRED(16x32bx2, x16, i32), TCGEN05LDRED(16x32bx2, x16, f32)},
+ {TCGEN05LDRED(16x32bx2, x32, i32), TCGEN05LDRED(16x32bx2, x32, f32)},
+ {TCGEN05LDRED(16x32bx2, x64, i32), TCGEN05LDRED(16x32bx2, x64, f32)},
+ {TCGEN05LDRED(16x32bx2, x128, i32), TCGEN05LDRED(16x32bx2, x128, f32)},
+ };
+
+ NVVM::Tcgen05LdStShape shape = thisOp.getShape();
+ unsigned ID = [&]() {
+ // `num` contains the length of vector and log2 of `num` returns the index
+ // into the shape array
+ unsigned idx = std::log2(Num);
+ switch (shape) {
+ case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+ return Shape32x32b[idx][IsFloat];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+ return Shape16x32bx2[idx][IsFloat];
+ default:
+ llvm_unreachable("unhandled tcgen05.ld lowering");
+ }
+ }();
+
+ args.push_back(mt.lookupValue(thisOp.getAddr()));
+
+ if (shape == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2)
+ args.push_back(mt.lookupValue(thisOp.getOffset()));
+
+ args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getOp())));
+
+ if (IsFloat) {
+ args.push_back(builder.getInt1(static_cast<unsigned>(thisOp.getAbs())));
+ args.push_back(builder.getInt1(static_cast<unsigned>(thisOp.getNan())));
+ }
+ return {ID, args};
+}
+
+LogicalResult Tcgen05LdRedOp::verify() {
+ VectorType data = cast<VectorType>(getData().getType());
+ Type redVal = getRedVal().getType();
+
+ if (data.getElementType() != redVal)
+ return emitError(
+ "type of reduction value and element type of vector data should match");
+
+ if (redVal.isInteger() && (getAbs() || getNan())) {
+ return emitError("abs or nan is only applicable for f32 type");
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// NVVMDialect initialization, type parsing, and registration.
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
new file mode 100644
index 0000000000000..9c77eca62b79b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red-invalid.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-translate --mlir-to-llvmir -verify-diagnostics -split-input-file %s
+
+llvm.func @tcgen05_ld_red_same_types(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{type of reduction value and element type of vector data should match}}
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, f32
+ llvm.return
+}
+
+// -----
+
+llvm.func @tcgen05_ld_red_i32_abs_nan(%addr : !llvm.ptr<6>) {
+ // expected-error @below {{abs or nan is only applicable for f32 type}}
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>, nan, abs} : vector<2 x i32>, i32
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
new file mode 100644
index 0000000000000..84c54caf043d6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-red.mlir
@@ -0,0 +1,475 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_min
+llvm.func @nvvm_tcgen05_ld_32x32b_min(%addr : !llvm.ptr<6>) {
+
+ // CHECK: {{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: {{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 0
+ // CHECK: %{{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 1
+ // CHECK: %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %{{.*}}, i32 0)
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 0, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x i32>, i32
+
+ %data3, %redval3 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x i32>, i32
+
+ %data4, %redval4 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x i32>, i32
+
+ %data5, %redval5 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x i32>, i32
+
+ %data6, %redval6 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x i32>, i32
+
+ %data7, %redval7 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<2 x f32>, f32
+
+ %data8, %redval8 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<4 x f32>, f32
+
+ %data9, %redval9 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<8 x f32>, f32
+
+ %data10, %redval10 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<16 x f32>, f32
+
+ %data11, %redval11 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<32 x f32>, f32
+
+ %data12, %redval12 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<64 x f32>, f32
+
+ %data13, %redval13 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<min>} : vector<128 x f32>, f32
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_max
+llvm.func @nvvm_tcgen05_ld_32x32b_max(%addr : !llvm.ptr<6>) {
+
+ // CHECK: {{.*}} = call { <2 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <2 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <4 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 0
+ // CHECK: {{.*}} = extractvalue { <4 x i32>, i32 } %{{.*}} 1
+ // CHECK: {{.*}} = call { <8 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: {{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 0
+ // CHECK: %{{.*}} = extractvalue { <8 x i32>, i32 } %{{.*}} 1
+ // CHECK: %{{.*}} = call { <16 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x i32>, i32 } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.i32(ptr addrspace(6) %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x i32>, i32 } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <2 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x2.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <2 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <4 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x4.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <4 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <8 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x8.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <8 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <16 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x16.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <16 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <32 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x32.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <32 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <64 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x64.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <64 x float>, float } %{{.*}}, 1
+ // CHECK: %{{.*}} = call { <128 x float>, float } @llvm.nvvm.tcgen05.ld.red.32x32b.x128.f32(ptr addrspace(6) %{{.*}}, i32 1, i1 false, i1 false)
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 0
+ // CHECK: %{{.*}} = extractvalue { <128 x float>, float } %{{.*}}, 1
+
+ %data, %redval = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<2 x i32>, i32
+
+ %data1, %redval1 = nvvm.tcgen05.ld.red %addr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, op = #nvvm.tcgen05_ld_red_op<max>} : vector<4 x i32>, i32
+
+ %data2, %redval2 = nvvm.tcgen05.ld.red %ad...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/177330
More information about the Mlir-commits
mailing list