[Mlir-commits] [mlir] [MLIR][AMDGPU] Add permlane16.var and permlanex16.var intrinsic ops (PR #199501)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed May 27 02:41:17 PDT 2026
https://github.com/stefankoncarevic updated https://github.com/llvm/llvm-project/pull/199501
>From 135156fcd3d19e69923e3dace80a8c99ea7457fc Mon Sep 17 00:00:00 2001
From: Stefan Koncarevic <stefan.koncarevic at amd.com>
Date: Mon, 25 May 2026 04:57:11 -0400
Subject: [PATCH 1/2] [MLIR][AMDGPU] Add permlane16.var and permlanex16.var
intrinsic ops

Add ROCDL and AMDGPU dialect support for the GFX12+ variable-selector
permlane intrinsics (`v_permlane16_var_b32` / `v_permlanex16_var_b32`).
Unlike the existing fixed-selector `permlane16`/`permlanex16` ops where
source-lane indices come from SGPR immediates, the "var" variants take
per-lane source-lane indices from a VGPR, enabling arbitrary per-lane
intra-row and cross-row permutations within a wave32 subgroup.

ROCDL dialect:
- Add `ROCDL_Permlane16VarOp` mapping to `llvm.amdgcn.permlane16.var`
- Add `ROCDL_PermlaneX16VarOp` mapping to `llvm.amdgcn.permlanex16.var`
- Both ops take (old, src0, src1, fi, boundControl) with fi and
  boundControl as immediate i1 attributes

AMDGPU dialect:
- Add `AMDGPU_PermlaneVarOp` as a high-level wrapper that supports
  arbitrary int/float/vector types via automatic decomposition to i32
- `cross` attribute selects between intra-row (permlane16.var) and
  cross-row (permlanex16.var) modes
- Lowering uses `LLVM::decomposeValue`/`composeValue` to repack arbitrary
  types (f16, i8, vectors, etc.) into i32 register slots

Tests:
- ROCDL roundtrip tests (parser/printer) for both ops
- ROCDL translation tests (MLIR -> LLVM IR intrinsic calls)
- AMDGPU-to-ROCDL lowering tests covering i32, f32, f16, vector<4xf16>,
  cross variants, and fi/bound_ctrl attributes
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 37 ++++++++
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 63 ++++++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 47 ++++++++++-
.../AMDGPUToROCDL/permlane-var.mlir | 84 +++++++++++++++++++
mlir/test/Dialect/LLVMIR/rocdl.mlir | 20 +++++
mlir/test/Target/LLVMIR/rocdl.mlir | 16 ++++
6 files changed, 266 insertions(+), 1 deletion(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index c833e1a6fc793..00038005565e0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -825,6 +825,43 @@ def AMDGPU_PermlaneSwapOp : AMDGPU_Op<"permlane_swap", [Pure, AllTypesMatch<["re
let hasVerifier = 1;
}
+def AMDGPU_PermlaneVarOp : AMDGPU_Op<"permlane_var",
+ [Pure, AllTypesMatch<["result", "src"]>]> {
+ let summary = "AMDGPU variable-selector permlane op (GFX12+)";
+ let description = [{
+ High-level wrapper on `rocdl.permlane16.var` and `rocdl.permlanex16.var`
+ for per-lane variable-selector permutations within a wave32 subgroup.
+
+ Supports arbitrary int/float/vector types, which will be repacked to i32
+ and one or more ROCDL intrinsic calls during lowering.
+
+ - `cross = false`: intra-row permutation (each lane in a 16-lane row reads
+ from a lane in the same row, selected by `$selector`). Maps to
+ `rocdl.permlane16.var`.
+ - `cross = true`: cross-row permutation (each lane reads from the opposite
+ 16-lane row, selected by `$selector`). Maps to `rocdl.permlanex16.var`.
+
+ `$selector` is an i32 VGPR providing the per-lane source-lane index.
+
+ Example:
+ ```mlir
+ %0 = amdgpu.permlane_var %src, %sel { cross = false } : f16
+ %1 = amdgpu.permlane_var %src, %sel { cross = true } : f32
+ ```
+
+ Note: Lowering is only supported on GFX12+.
+ }];
+ let arguments = (ins AnyIntegerOrFloatOr1DVector:$src,
+ I32:$selector,
+ DefaultValuedAttr<BoolAttr, "false">:$cross,
+ DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
+ DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl);
+ let results = (outs AnyIntegerOrFloatOr1DVector:$result);
+ let assemblyFormat = [{
+ $src `,` $selector attr-dict `:` type($result)
+ }];
+}
+
def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let summary = "Barrier that includes a wait for LDS memory operations.";
let description = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 14e3a359cab3a..645d6e10b6eca 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -2475,6 +2475,69 @@ def ROCDL_Permlane32SwapOp : ROCDL_IntrOp<"permlane32.swap", [], [],
}];
}
+// Permlane16 variable intrinsic operation (GFX12+)
+//
+// Per-lane variable-selector intra-row (within each 16-lane row) permutation.
+// Unlike `permlane16`, the source-lane indices come from a per-lane VGPR
+// (`$src1`) rather than from immediate SGPR fields, allowing arbitrary
+// per-lane intra-row permutations.
+def ROCDL_Permlane16VarOp : ROCDL_IntrOp<"permlane16.var", [], [],
+ [], 1, 0, 0, 0,
+ [3, 4], ["fi", "boundControl"]>,
+ Arguments<(ins I32:$old, I32:$src0, I32:$src1,
+ I1Attr:$fi, I1Attr:$boundControl)> {
+ let results = (outs I32:$res);
+ let assemblyFormat = [{
+ attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` `(` type($old) `,` type($src0) `)` `->` type($res)
+ }];
+ let description = [{
+ Performs a `permlane16.var` operation: a per-lane variable-selector
+ intra-row permutation (within each 16-lane row). Maps to
+ `llvm.amdgcn.permlane16.var`.
+
+ Each destination lane within a 16-lane row reads its value from the
+ source lane whose index is given by the corresponding per-lane entry
+ in `$src1` (a VGPR). `$fi` and `$boundControl` are immediate i1 attrs
+ matching the underlying intrinsic's modifiers.
+
+ Example:
+ ```mlir
+ %res = rocdl.permlane16.var %old, %src, %selector, false, true : (i32, i32) -> i32
+ ```
+ }];
+}
+
+// PermlaneX16 variable intrinsic operation (GFX12+)
+//
+// Per-lane variable-selector cross-row permutation (each lane in row R reads
+// from a per-lane-chosen source lane in the opposite 16-lane row).
+def ROCDL_PermlaneX16VarOp : ROCDL_IntrOp<"permlanex16.var", [], [],
+ [], 1, 0, 0, 0,
+ [3, 4], ["fi", "boundControl"]>,
+ Arguments<(ins I32:$old, I32:$src0, I32:$src1,
+ I1Attr:$fi, I1Attr:$boundControl)> {
+ let results = (outs I32:$res);
+ let assemblyFormat = [{
+ attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` `(` type($old) `,` type($src0) `)` `->` type($res)
+ }];
+ let description = [{
+ Performs a `permlanex16.var` operation: a per-lane variable-selector
+ cross-row permutation (each lane in one 16-lane row reads from a
+ per-lane-chosen source lane in the opposite row). Maps to
+ `llvm.amdgcn.permlanex16.var`.
+
+ With per-lane "identity" selectors this realises an XOR-16 swap pattern
+ in pure VALU; with arbitrary selectors it enables general per-lane
+ cross-half-wave permutations. `$fi` and `$boundControl` are immediate
+ i1 attrs matching the underlying intrinsic's modifiers.
+
+ Example:
+ ```mlir
+ %res = rocdl.permlanex16.var %old, %src, %selector, false, true : (i32, i32) -> i32
+ ```
+ }];
+}
+
//===---------------------------------------------------------------------===//
// 16-bit float intrinsics
//===---------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 509a36e0b2716..48bd89b3a2fb7 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3291,6 +3291,51 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
}
};
+struct AMDGPUPermlaneVarLowering
+ : public ConvertOpToLLVMPattern<PermlaneVarOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ AMDGPUPermlaneVarLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<PermlaneVarOp>(converter), chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(PermlaneVarOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1200)
+ return op->emitOpError("permlane_var is only supported on GFX12+");
+
+ Location loc = op.getLoc();
+ Type i32 = rewriter.getI32Type();
+ Value src = adaptor.getSrc();
+ Value selector = adaptor.getSelector();
+ bool cross = op.getCross();
+ bool fi = op.getFetchInactive();
+ bool boundCtrl = op.getBoundCtrl();
+
+ SmallVector<Value> decomposed;
+ if (failed(LLVM::decomposeValue(rewriter, loc, src, i32, decomposed)))
+ return rewriter.notifyMatchFailure(op,
+ "failed to decompose value to i32");
+
+ SmallVector<Value> permuted;
+ for (Value v : decomposed) {
+ Value res;
+ if (cross)
+ res = ROCDL::PermlaneX16VarOp::create(rewriter, loc, i32, v, v,
+ selector, fi, boundCtrl);
+ else
+ res = ROCDL::Permlane16VarOp::create(rewriter, loc, i32, v, v, selector,
+ fi, boundCtrl);
+ permuted.emplace_back(res);
+ }
+
+ Value result = LLVM::composeValue(rewriter, loc, permuted, src.getType());
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
//===----------------------------------------------------------------------===//
// In-LDS Barrier Operations
//===----------------------------------------------------------------------===//
@@ -4559,7 +4604,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
GlobalLoadAsyncToLDSOpLowering, TransposeLoadOpLowering,
GlobalTransposeLoadOpLowering, AMDGPUPermlaneLowering,
- AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
+ AMDGPUPermlaneVarLowering, AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
new file mode 100644
index 0000000000000..7e1c8cce824c5
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
@@ -0,0 +1,84 @@
+// RUN: mlir-opt --convert-amdgpu-to-rocdl=chipset=gfx1200 --canonicalize %s | FileCheck %s
+
+// CHECK-LABEL: func @test_permlane_var_i32
+// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_i32(%src : i32, %sel : i32) -> i32 {
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: return %[[RES]] : i32
+ %0 = amdgpu.permlane_var %src, %sel : i32
+ return %0 : i32
+}
+
+// CHECK-LABEL: func @test_permlane_var_cross_i32
+// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_cross_i32(%src : i32, %sel : i32) -> i32 {
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: return %[[RES]] : i32
+ %0 = amdgpu.permlane_var %src, %sel { cross = true } : i32
+ return %0 : i32
+}
+
+// CHECK-LABEL: func @test_permlane_var_f32
+// CHECK-SAME: (%[[SRC:.*]]: f32, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_f32(%src : f32, %sel : i32) -> f32 {
+// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f32 to i32
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[CAST]], %[[CAST]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
+// CHECK: return %[[RES_CAST]] : f32
+ %0 = amdgpu.permlane_var %src, %sel : f32
+ return %0 : f32
+}
+
+// CHECK-LABEL: func @test_permlane_var_f16
+// CHECK-SAME: (%[[SRC:.*]]: f16, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_f16(%src : f16, %sel : i32) -> f16 {
+// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f16 to i16
+// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
+// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
+// CHECK: return %[[RES_CAST]] : f16
+ %0 = amdgpu.permlane_var %src, %sel : f16
+ return %0 : f16
+}
+
+// CHECK-LABEL: func @test_permlane_var_cross_f16
+// CHECK-SAME: (%[[SRC:.*]]: f16, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_cross_f16(%src : f16, %sel : i32) -> f16 {
+// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f16 to i16
+// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
+// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
+// CHECK: return %[[RES_CAST]] : f16
+ %0 = amdgpu.permlane_var %src, %sel { cross = true } : f16
+ return %0 : f16
+}
+
+// CHECK-LABEL: func @test_permlane_var_4xf16
+// CHECK-SAME: (%[[SRC:.*]]: vector<4xf16>, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_4xf16(%src : vector<4xf16>, %sel : i32) -> vector<4xf16> {
+// CHECK-DAG: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK-DAG: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK-DAG: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : vector<4xf16> to vector<2xi32>
+// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[CAST]][%[[C0]] : i32] : vector<2xi32>
+// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[CAST]][%[[C1]] : i32] : vector<2xi32>
+// CHECK: %[[PERM0:.*]] = rocdl.permlane16.var %[[ELEM0]], %[[ELEM0]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[PERM1:.*]] = rocdl.permlane16.var %[[ELEM1]], %[[ELEM1]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[INS0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK: %[[INS1:.*]] = llvm.insertelement %[[PERM1]], %[[INS0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK: %[[RES:.*]] = llvm.bitcast %[[INS1]] : vector<2xi32> to vector<4xf16>
+// CHECK: return %[[RES]] : vector<4xf16>
+ %0 = amdgpu.permlane_var %src, %sel : vector<4xf16>
+ return %0 : vector<4xf16>
+}
+
+// CHECK-LABEL: func @test_permlane_var_attrs
+// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
+func.func @test_permlane_var_attrs(%src : i32, %sel : i32) -> i32 {
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], true, true : (i32, i32) -> i32
+// CHECK: return %[[RES]] : i32
+ %0 = amdgpu.permlane_var %src, %sel { cross = true, fetch_inactive = true, bound_ctrl = true } : i32
+ return %0 : i32
+}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index db1cefe86dfc6..2fa735c0670c9 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1335,6 +1335,26 @@ llvm.func @rocdl.permlane32.swap(%src : i32) -> !llvm.struct<(i32, i32)> {
// -----
+llvm.func @rocdl.permlane16.var(%src : i32) -> i32 {
+ %sel = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK-LABEL: rocdl.permlane16.var
+ // CHECK: rocdl.permlane16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32) -> i32
+ %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ llvm.return %ret : i32
+}
+
+// -----
+
+llvm.func @rocdl.permlanex16.var(%src : i32) -> i32 {
+ %sel = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK-LABEL: rocdl.permlanex16.var
+ // CHECK: rocdl.permlanex16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32) -> i32
+ %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ llvm.return %ret : i32
+}
+
+// -----
+
// CHECK-LABEL: rocdl.cvt.scale.pk8
llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 9e1d98b8422da..dfc2e2b7f9379 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1464,6 +1464,22 @@ llvm.func @rocdl.permlane32.swap(%src : i32) -> !llvm.struct<(i32, i32)> {
llvm.return %ret : !llvm.struct<(i32, i32)>
}
+llvm.func @rocdl.permlane16.var(%src : i32) -> i32 {
+ %sel = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK-LABEL: rocdl.permlane16.var
+ // CHECK: call i32 @llvm.amdgcn.permlane16.var(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i1 false, i1 true)
+ %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ llvm.return %ret : i32
+}
+
+llvm.func @rocdl.permlanex16.var(%src : i32) -> i32 {
+ %sel = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK-LABEL: rocdl.permlanex16.var
+ // CHECK: call i32 @llvm.amdgcn.permlanex16.var(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i1 false, i1 true)
+ %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ llvm.return %ret : i32
+}
+
llvm.func @rocdl.wmma.fp8(%arg0 : vector<2 x i32>, %arg1 : vector<8xf32>) -> vector<8xf32> {
// CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}})
%r0 = rocdl.wmma.f32.16x16x16.fp8_fp8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
>From 2a15c33fb1ea0d28f0af12db5d80a98215cb44f6 Mon Sep 17 00:00:00 2001
From: Stefan Koncarevic <stefan.koncarevic at amd.com>
Date: Wed, 27 May 2026 04:23:30 -0400
Subject: [PATCH 2/2] Use functional-type in permlane var op assembly format
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 ++++----
.../Conversion/AMDGPUToROCDL/permlane-var.mlir | 16 ++++++++--------
mlir/test/Dialect/LLVMIR/rocdl.mlir | 8 ++++----
mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++--
4 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 645d6e10b6eca..1149725a24c50 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -2488,7 +2488,7 @@ def ROCDL_Permlane16VarOp : ROCDL_IntrOp<"permlane16.var", [], [],
I1Attr:$fi, I1Attr:$boundControl)> {
let results = (outs I32:$res);
let assemblyFormat = [{
- attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` `(` type($old) `,` type($src0) `)` `->` type($res)
+ attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` functional-type(operands, $res)
}];
let description = [{
Performs a `permlane16.var` operation: a per-lane variable-selector
@@ -2502,7 +2502,7 @@ def ROCDL_Permlane16VarOp : ROCDL_IntrOp<"permlane16.var", [], [],
Example:
```mlir
- %res = rocdl.permlane16.var %old, %src, %selector, false, true : (i32, i32) -> i32
+ %res = rocdl.permlane16.var %old, %src, %selector, false, true : (i32, i32, i32) -> i32
```
}];
}
@@ -2518,7 +2518,7 @@ def ROCDL_PermlaneX16VarOp : ROCDL_IntrOp<"permlanex16.var", [], [],
I1Attr:$fi, I1Attr:$boundControl)> {
let results = (outs I32:$res);
let assemblyFormat = [{
- attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` `(` type($old) `,` type($src0) `)` `->` type($res)
+ attr-dict $old `,` $src0 `,` $src1 `,` $fi `,` $boundControl `:` functional-type(operands, $res)
}];
let description = [{
Performs a `permlanex16.var` operation: a per-lane variable-selector
@@ -2533,7 +2533,7 @@ def ROCDL_PermlaneX16VarOp : ROCDL_IntrOp<"permlanex16.var", [], [],
Example:
```mlir
- %res = rocdl.permlanex16.var %old, %src, %selector, false, true : (i32, i32) -> i32
+ %res = rocdl.permlanex16.var %old, %src, %selector, false, true : (i32, i32, i32) -> i32
```
}];
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
index 7e1c8cce824c5..ddbffcc5a8cb9 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane-var.mlir
@@ -3,7 +3,7 @@
// CHECK-LABEL: func @test_permlane_var_i32
// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
func.func @test_permlane_var_i32(%src : i32, %sel : i32) -> i32 {
-// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: return %[[RES]] : i32
%0 = amdgpu.permlane_var %src, %sel : i32
return %0 : i32
@@ -12,7 +12,7 @@ func.func @test_permlane_var_i32(%src : i32, %sel : i32) -> i32 {
// CHECK-LABEL: func @test_permlane_var_cross_i32
// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
func.func @test_permlane_var_cross_i32(%src : i32, %sel : i32) -> i32 {
-// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: return %[[RES]] : i32
%0 = amdgpu.permlane_var %src, %sel { cross = true } : i32
return %0 : i32
@@ -22,7 +22,7 @@ func.func @test_permlane_var_cross_i32(%src : i32, %sel : i32) -> i32 {
// CHECK-SAME: (%[[SRC:.*]]: f32, %[[SEL:.*]]: i32)
func.func @test_permlane_var_f32(%src : f32, %sel : i32) -> f32 {
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f32 to i32
-// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[CAST]], %[[CAST]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[CAST]], %[[CAST]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
// CHECK: return %[[RES_CAST]] : f32
%0 = amdgpu.permlane_var %src, %sel : f32
@@ -34,7 +34,7 @@ func.func @test_permlane_var_f32(%src : f32, %sel : i32) -> f32 {
func.func @test_permlane_var_f16(%src : f16, %sel : i32) -> f16 {
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f16 to i16
// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
-// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlane16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
// CHECK: return %[[RES_CAST]] : f16
@@ -47,7 +47,7 @@ func.func @test_permlane_var_f16(%src : f16, %sel : i32) -> f16 {
func.func @test_permlane_var_cross_f16(%src : f16, %sel : i32) -> f16 {
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : f16 to i16
// CHECK: %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
-// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[ZEXT]], %[[ZEXT]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
// CHECK: %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
// CHECK: return %[[RES_CAST]] : f16
@@ -64,8 +64,8 @@ func.func @test_permlane_var_4xf16(%src : vector<4xf16>, %sel : i32) -> vector<4
// CHECK: %[[CAST:.*]] = llvm.bitcast %[[SRC]] : vector<4xf16> to vector<2xi32>
// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[CAST]][%[[C0]] : i32] : vector<2xi32>
// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[CAST]][%[[C1]] : i32] : vector<2xi32>
-// CHECK: %[[PERM0:.*]] = rocdl.permlane16.var %[[ELEM0]], %[[ELEM0]], %[[SEL]], false, false : (i32, i32) -> i32
-// CHECK: %[[PERM1:.*]] = rocdl.permlane16.var %[[ELEM1]], %[[ELEM1]], %[[SEL]], false, false : (i32, i32) -> i32
+// CHECK: %[[PERM0:.*]] = rocdl.permlane16.var %[[ELEM0]], %[[ELEM0]], %[[SEL]], false, false : (i32, i32, i32) -> i32
+// CHECK: %[[PERM1:.*]] = rocdl.permlane16.var %[[ELEM1]], %[[ELEM1]], %[[SEL]], false, false : (i32, i32, i32) -> i32
// CHECK: %[[INS0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
// CHECK: %[[INS1:.*]] = llvm.insertelement %[[PERM1]], %[[INS0]][%[[C1]] : i32] : vector<2xi32>
// CHECK: %[[RES:.*]] = llvm.bitcast %[[INS1]] : vector<2xi32> to vector<4xf16>
@@ -77,7 +77,7 @@ func.func @test_permlane_var_4xf16(%src : vector<4xf16>, %sel : i32) -> vector<4
// CHECK-LABEL: func @test_permlane_var_attrs
// CHECK-SAME: (%[[SRC:.*]]: i32, %[[SEL:.*]]: i32)
func.func @test_permlane_var_attrs(%src : i32, %sel : i32) -> i32 {
-// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], true, true : (i32, i32) -> i32
+// CHECK: %[[RES:.*]] = rocdl.permlanex16.var %[[SRC]], %[[SRC]], %[[SEL]], true, true : (i32, i32, i32) -> i32
// CHECK: return %[[RES]] : i32
%0 = amdgpu.permlane_var %src, %sel { cross = true, fetch_inactive = true, bound_ctrl = true } : i32
return %0 : i32
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 2fa735c0670c9..5273c955c0121 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1338,8 +1338,8 @@ llvm.func @rocdl.permlane32.swap(%src : i32) -> !llvm.struct<(i32, i32)> {
llvm.func @rocdl.permlane16.var(%src : i32) -> i32 {
%sel = llvm.mlir.constant(-1 : i32) : i32
// CHECK-LABEL: rocdl.permlane16.var
- // CHECK: rocdl.permlane16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32) -> i32
- %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ // CHECK: rocdl.permlane16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32, i32) -> i32
+ %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32, i32) -> i32
llvm.return %ret : i32
}
@@ -1348,8 +1348,8 @@ llvm.func @rocdl.permlane16.var(%src : i32) -> i32 {
llvm.func @rocdl.permlanex16.var(%src : i32) -> i32 {
%sel = llvm.mlir.constant(-1 : i32) : i32
// CHECK-LABEL: rocdl.permlanex16.var
- // CHECK: rocdl.permlanex16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32) -> i32
- %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ // CHECK: rocdl.permlanex16.var %{{.*}}, %{{.*}}, %{{.*}}, false, true : (i32, i32, i32) -> i32
+ %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32, i32) -> i32
llvm.return %ret : i32
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index dfc2e2b7f9379..29e81c8208243 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1468,7 +1468,7 @@ llvm.func @rocdl.permlane16.var(%src : i32) -> i32 {
%sel = llvm.mlir.constant(-1 : i32) : i32
// CHECK-LABEL: rocdl.permlane16.var
// CHECK: call i32 @llvm.amdgcn.permlane16.var(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i1 false, i1 true)
- %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ %ret = rocdl.permlane16.var %src, %src, %sel, false, true : (i32, i32, i32) -> i32
llvm.return %ret : i32
}
@@ -1476,7 +1476,7 @@ llvm.func @rocdl.permlanex16.var(%src : i32) -> i32 {
%sel = llvm.mlir.constant(-1 : i32) : i32
// CHECK-LABEL: rocdl.permlanex16.var
// CHECK: call i32 @llvm.amdgcn.permlanex16.var(i32 %{{.*}}, i32 %{{.*}}, i32 -1, i1 false, i1 true)
- %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32) -> i32
+ %ret = rocdl.permlanex16.var %src, %src, %sel, false, true : (i32, i32, i32) -> i32
llvm.return %ret : i32
}
More information about the Mlir-commits
mailing list