[Mlir-commits] [mlir] [mlir][AMDGPU] Add PermlaneOp (PR #154345)

Thu Aug 21 08:13:05 PDT 2025

https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/154345

>From 5bcf5275f79747b4dcbb91491358904556d7d6e4 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Mon, 18 Aug 2025 13:18:00 +0000
Subject: [PATCH 01/12] add AMDGPU PermlaneOp

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  29 ++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  52 +++++-
 .../Conversion/AMDGPUToROCDL/permlane.mlir    | 153 ++++++++++++++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             |  14 ++
 4 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 2c646934c11c2..17386a2888788 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -656,6 +656,35 @@ def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
   }];
 }
 
+def AMDGPU_PermlanePerm : I32EnumAttr<"PermlanePerm",
+    "The possible permutations for a permlane operation",
+    [
+      I32EnumAttrCase<"swap_16",  0>,
+      I32EnumAttrCase<"swap_32",  1>,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
+  "permlane_perm">;
+
+def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]>,
+Arguments<(ins AnyIntegerOrFloatOr1DVector:$src, 
+                AMDGPU_PermlanePermAttr:$kind)> {
+  let summary = "AMDGPU permlane op";
+  let description = [{
+    High-level wrapper on `rocdl.permlane` variants.
+
+    Supports arbitrary int/float/vector types, which will be repacked to i32 and
+    one or more `rocdl.permlane` ops during lowering.
+  }];
+  let results = (outs AnyIntegerOrFloatOr1DVector:$result);
+  let assemblyFormat = [{
+    $src $kind attr-dict `:` type($result)
+  }];
+}
+
 def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
   let summary = "Barrier that includes a wait for LDS memory operations.";
   let description = [{
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 64720bfe6cf50..4b0f8bcb40811 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -1876,6 +1877,55 @@ struct AMDGPUSwizzleBitModeLowering
   }
 };
 
+struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  AMDGPUPermlaneLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<PermlaneOp>(converter), chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(PermlaneOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Type i32 = rewriter.getI32Type();
+    Value src = adaptor.getSrc();
+    auto kind = op.getKind();
+    auto fi = rewriter.getBoolAttr(false);
+    auto boundctrl = rewriter.getBoolAttr(false);
+
+    if (chipset < kGfx950)
+      return op->emitOpError("permlane_swap is only supported on gfx950+");
+
+    SmallVector<Value> decomposed =
+        LLVM::decomposeValue(rewriter, loc, src, i32);
+
+    SmallVector<Value> permuted;
+    for (Value v : decomposed) {
+      Value res;
+      Type i32pair = LLVM::LLVMStructType::getLiteral(
+          rewriter.getContext(), {v.getType(), v.getType()});
+      switch (kind) {
+      case PermlanePerm::swap_16:
+        res = ROCDL::Permlane16SwapOp::create(rewriter, loc, i32pair, v, v, fi,
+                                              boundctrl);
+        break;
+      case PermlanePerm::swap_32:
+        res = ROCDL::Permlane32SwapOp::create(rewriter, loc, i32pair, v, v, fi,
+                                              boundctrl);
+        break;
+      }
+
+      Value vdstNew = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
+      permuted.emplace_back(vdstNew);
+    }
+
+    Value result = LLVM::composeValue(rewriter, loc, permuted, src.getType());
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -1944,6 +1994,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
-           TransposeLoadOpLowering>(converter, chipset);
+           TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
new file mode 100644
index 0000000000000..5b216e393043f
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
@@ -0,0 +1,153 @@
+// RUN: mlir-opt -convert-amdgpu-to-rocdl=chipset=gfx950 --canonicalize %s | FileCheck %s
+
+// CHECK-LABEL: func @test_permlane16_i32
+// CHECK-SAME: (%[[ARG0:.*]]: i32)
+func.func @test_permlane16_i32(%arg0 : i32) -> i32 {
+// CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  return %[[RES]] : i32
+  %0 = amdgpu.permlane %arg0 swap_16 : i32
+  return %0 : i32
+}
+
+// CHECK-LABEL: func @test_permlane32_i32
+// CHECK-SAME: (%[[ARG0:.*]]: i32)
+func.func @test_permlane32_i32(%arg0 : i32) -> i32 {
+// CHECK:  %[[PERM:.*]] = rocdl.permlane32.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  return %[[RES]] : i32
+  %0 = amdgpu.permlane %arg0 swap_32 : i32
+  return %0 : i32
+}
+
+// CHECK-LABEL: func @test_permlane16_f32
+// CHECK-SAME: (%[[ARG0:.*]]: f32)
+func.func @test_permlane16_f32(%arg0 : f32) -> f32 {
+// CHECK:  %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f32 to i32
+// CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[CAST]], %[[CAST]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
+// CHECK:  return %[[RES_CAST]] : f32
+  %0 = amdgpu.permlane %arg0 swap_16 : f32
+  return %0 : f32
+}
+
+// CHECK-LABEL: func @test_permlane32_f32
+// CHECK-SAME: (%[[ARG0:.*]]: f32)
+func.func @test_permlane32_f32(%arg0 : f32) -> f32 {
+// CHECK:  %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f32 to i32
+// CHECK:  %[[PERM:.*]] = rocdl.permlane32.swap %[[CAST]], %[[CAST]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
+// CHECK:  return %[[RES_CAST]] : f32
+  %0 = amdgpu.permlane %arg0 swap_32 : f32
+  return %0 : f32
+}
+
+// CHECK-LABEL: func @test_permlane16_f16
+// CHECK-SAME: (%[[ARG0:.*]]: f16)
+func.func @test_permlane16_f16(%arg0 : f16) -> f16 {
+// CHECK:  %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f16 to i16
+// CHECK:  %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
+// CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ZEXT]], %[[ZEXT]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
+// CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
+// CHECK:  return %[[RES_CAST]] : f16
+  %0 = amdgpu.permlane %arg0 swap_16 : f16
+  return %0 : f16
+}
+
+// CHECK-LABEL: func @test_permlane32_f16
+// CHECK-SAME: (%[[ARG0:.*]]: f16)
+func.func @test_permlane32_f16(%arg0 : f16) -> f16 {
+// CHECK:  %[[CAST:.*]] = llvm.bitcast %[[ARG0]] : f16 to i16
+// CHECK:  %[[ZEXT:.*]] = llvm.zext %[[CAST]] : i16 to i32
+// CHECK:  %[[PERM:.*]] = rocdl.permlane32.swap %[[ZEXT]], %[[ZEXT]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
+// CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
+// CHECK:  return %[[RES_CAST]] : f16
+  %0 = amdgpu.permlane %arg0 swap_32 : f16
+  return %0 : f16
+}
+
+// CHECK-LABEL: func @test_permlane16_2xi32
+// CHECK-SAME: (%[[ARG0:.*]]: vector<2xi32>)
+func.func @test_permlane16_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
+// CHECK-DAG:  %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK-DAG:  %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK-DAG:  %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:      %[[ELEM0:.*]] = llvm.extractelement %[[ARG0]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[ELEM1:.*]] = llvm.extractelement %[[ARG0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[PERM0_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[PERM1_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      return %[[VEC_INSERT1]] : vector<2xi32>
+  %0 = amdgpu.permlane %arg0 swap_16 : vector<2xi32>
+  return %0 : vector<2xi32>
+}
+
+// CHECK-LABEL: func @test_permlane32_2xi32
+// CHECK-SAME: (%[[ARG0:.*]]: vector<2xi32>)
+func.func @test_permlane32_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
+// CHECK-DAG:  %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK-DAG:  %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK-DAG:  %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:      %[[ELEM0:.*]] = llvm.extractelement %[[ARG0]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[ELEM1:.*]] = llvm.extractelement %[[ARG0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[PERM0_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[PERM1_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      return %[[VEC_INSERT1]] : vector<2xi32>
+  %0 = amdgpu.permlane %arg0 swap_32 : vector<2xi32>
+  return %0 : vector<2xi32>
+}
+
+// CHECK-LABEL: func @test_permlane16_4xf16
+// CHECK-SAME: (%[[ARG0:.*]]: vector<4xf16>)
+func.func @test_permlane16_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
+// CHECK-DAG:  %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK-DAG:  %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK-DAG:  %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:      %[[CAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<4xf16> to vector<2xi32>
+// CHECK:      %[[ELEM0:.*]] = llvm.extractelement %[[CAST1]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[ELEM1:.*]] = llvm.extractelement %[[CAST1]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[PERM0_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[PERM1_TUPLE:.*]] = rocdl.permlane16.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
+// CHECK:      return %[[CAST2]] : vector<4xf16>
+  %0 = amdgpu.permlane %arg0 swap_16 : vector<4xf16>
+  return %0 : vector<4xf16>
+}
+
+// CHECK-LABEL: func @test_permlane32_4xf16
+// CHECK-SAME: (%[[ARG0:.*]]: vector<4xf16>)
+func.func @test_permlane32_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
+// CHECK-DAG:  %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK-DAG:  %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK-DAG:  %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:      %[[CAST1:.*]] = llvm.bitcast %[[ARG0]] : vector<4xf16> to vector<2xi32>
+// CHECK:      %[[ELEM0:.*]] = llvm.extractelement %[[CAST1]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[ELEM1:.*]] = llvm.extractelement %[[CAST1]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[PERM0_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM0]], %[[ELEM0]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM0:.*]] = llvm.extractvalue %[[PERM0_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[PERM1_TUPLE:.*]] = rocdl.permlane32.swap %[[ELEM1]], %[[ELEM1]], false, false : (i32, i32) -> <(i32, i32)>
+// CHECK:      %[[PERM1:.*]] = llvm.extractvalue %[[PERM1_TUPLE]][0] : !llvm.struct<(i32, i32)>
+// CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK:      %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
+// CHECK:      return %[[CAST2]] : vector<4xf16>
+  %0 = amdgpu.permlane %arg0 swap_32 : vector<4xf16>
+  return %0 : vector<4xf16>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 87e11c028c62a..5eb6d53d4d9fe 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -524,6 +524,20 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
   func.return %0 : f32
 }
 
+// CHECK-LABEL: func @permlane16_swap
+func.func @permlane16_swap(%arg0 : f32) -> f32 {
+  // CHECK: amdgpu.permlane
+  %0 = amdgpu.permlane %arg0 swap_16 : f32
+  func.return %0 : f32
+}
+
+// CHECK-LABEL: func @permlane32_swap
+func.func @permlane32_swap(%arg0 : f32) -> f32 {
+  // CHECK: amdgpu.permlane
+  %0 = amdgpu.permlane %arg0 swap_32 : f32
+  func.return %0 : f32
+}
+
 // CHECK-LABEL: func @scaled_mfma
 func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
   // CHECK: amdgpu.scaled_mfma

>From e3a87752e8c64ef50d000e0b37b02b5a830b83f2 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 19 Aug 2025 10:34:19 -0500
Subject: [PATCH 02/12] expose boundCtrl and fi

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td       |  4 +++-
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp |  4 ++--
 mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir    | 10 ++++++++++
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 17386a2888788..019da46246841 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -671,7 +671,9 @@ def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
 
 def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]>,
 Arguments<(ins AnyIntegerOrFloatOr1DVector:$src, 
-                AMDGPU_PermlanePermAttr:$kind)> {
+               AMDGPU_PermlanePermAttr:$kind,
+               DefaultValuedAttr<BoolAttr, "false">:$fi,
+               DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl)> {
   let summary = "AMDGPU permlane op";
   let description = [{
     High-level wrapper on `rocdl.permlane` variants.
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4b0f8bcb40811..1f5c6bec92c42 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1891,8 +1891,8 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
     Type i32 = rewriter.getI32Type();
     Value src = adaptor.getSrc();
     auto kind = op.getKind();
-    auto fi = rewriter.getBoolAttr(false);
-    auto boundctrl = rewriter.getBoolAttr(false);
+    auto fi = op.getFi();
+    auto boundctrl = op.getBoundCtrl();
 
     if (chipset < kGfx950)
       return op->emitOpError("permlane_swap is only supported on gfx950+");
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
index 5b216e393043f..81b98c4e6bc3e 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
@@ -10,6 +10,16 @@ func.func @test_permlane16_i32(%arg0 : i32) -> i32 {
   return %0 : i32
 }
 
+// CHECK-LABEL: func @test_permlane16_i32_optional_attr
+// CHECK-SAME: (%[[ARG0:.*]]: i32)
+func.func @test_permlane16_i32_optional_attr(%arg0 : i32) -> i32 {
+// CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], true, true : (i32, i32) -> <(i32, i32)>
+// CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
+// CHECK:  return %[[RES]] : i32
+  %0 = amdgpu.permlane %arg0 swap_16 { fi = true, bound_ctrl = true }  : i32
+  return %0 : i32
+}
+
 // CHECK-LABEL: func @test_permlane32_i32
 // CHECK-SAME: (%[[ARG0:.*]]: i32)
 func.func @test_permlane32_i32(%arg0 : i32) -> i32 {

>From 6d3ab9c076cece1b871f74c77eff16cefc418ee3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 04:07:07 -0500
Subject: [PATCH 03/12] rename fi and replace auto with types

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
 mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 1f5c6bec92c42..66ade9eae4319 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1891,8 +1891,8 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
     Type i32 = rewriter.getI32Type();
     Value src = adaptor.getSrc();
     auto kind = op.getKind();
-    auto fi = op.getFi();
-    auto boundctrl = op.getBoundCtrl();
+    bool fi = op.getFetchInactive();
+    bool boundctrl = op.getBoundCtrl();
 
     if (chipset < kGfx950)
       return op->emitOpError("permlane_swap is only supported on gfx950+");
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
index 81b98c4e6bc3e..d341afa8b9077 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
@@ -16,7 +16,7 @@ func.func @test_permlane16_i32_optional_attr(%arg0 : i32) -> i32 {
 // CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], true, true : (i32, i32) -> <(i32, i32)>
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  return %[[RES]] : i32
-  %0 = amdgpu.permlane %arg0 swap_16 { fi = true, bound_ctrl = true }  : i32
+  %0 = amdgpu.permlane %arg0 swap_16 { fetch_inactive = true, bound_ctrl = true }  : i32
   return %0 : i32
 }
 

>From b37ec370f553b2318c3ac15c437e90e809ce5cb9 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 04:07:27 -0500
Subject: [PATCH 04/12] improve docs

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 29 +++++++++++++++++--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 019da46246841..2ccf6b4f46df6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -672,14 +672,37 @@ def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
 def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]>,
 Arguments<(ins AnyIntegerOrFloatOr1DVector:$src, 
                AMDGPU_PermlanePermAttr:$kind,
-               DefaultValuedAttr<BoolAttr, "false">:$fi,
+               DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
                DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl)> {
   let summary = "AMDGPU permlane op";
   let description = [{
-    High-level wrapper on `rocdl.permlane` variants.
+    High-level wrapper on `rocdl.permlane.*` variants.
 
     Supports arbitrary int/float/vector types, which will be repacked to i32 and
-    one or more `rocdl.permlane` ops during lowering.
+    one or more `rocdl.permlane.*` ops during lowering.
+    The following lane permutations are supported:
+    - Swap the data between odd and even rows of 16 lanes (`swap_16`)
+    - Swap the data between the first 32 lanes and the last 32 lanes (`swap_32`)
+
+    Format example:
+    ```
+    %0 = amdgpu.permlane %src swap_16 : f16
+    %1 = amdgpu.permlane %src swap_32 { fetch_inactive = true, bound_ctrl = true } : f16
+    ```
+
+    Operands:
+    * `$src`: Vector register to permute across lanes
+    * `$kind`: The kind of permutation operation.
+    * `$fetch_inactive`: Optional. Used to dertermine behavior of invalid lanes (disabled thread or out-of-range).
+      `fetch_inactive = false`: If source lane is invalid, use `bound_ctrl` to determine the source value.
+      `fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway
+      (ignoring `bound_ctrl`). If the source lane is out-of-range, behavior is decided by `bound_ctrl`.
+    * `$bound_ctrl`: Optional. Used to determine what a thread should do if its source operand is from 
+      a disabled thread or invalid input: use the value zero, or disable the write.
+      `bound_ctrl = false`: Do not write when source is invalid or out-of-range.
+      `bound_ctrl = true`: Use zero as input if source is invalid or out-of-range
+
+    Note: Lowering is only supported on gfx950 and up.
   }];
   let results = (outs AnyIntegerOrFloatOr1DVector:$result);
   let assemblyFormat = [{

>From 2be450e2db6053e72b46b0383cbcef0df4523eb1 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 04:08:53 -0500
Subject: [PATCH 05/12] fix --

---
 mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
index d341afa8b9077..fa0049274d9cf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-amdgpu-to-rocdl=chipset=gfx950 --canonicalize %s | FileCheck %s
+// RUN: mlir-opt --convert-amdgpu-to-rocdl=chipset=gfx950 --canonicalize %s | FileCheck %s
 
 // CHECK-LABEL: func @test_permlane16_i32
 // CHECK-SAME: (%[[ARG0:.*]]: i32)

>From c1dbe0e2106d9210d1782833fce670152778cec3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Wed, 20 Aug 2025 04:09:44 -0500
Subject: [PATCH 06/12] move chipset check

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 66ade9eae4319..605ab4ec757d1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1887,6 +1887,9 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
   LogicalResult
   matchAndRewrite(PermlaneOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx950)
+      return op->emitOpError("permlane_swap is only supported on gfx950+");
+
     Location loc = op.getLoc();
     Type i32 = rewriter.getI32Type();
     Value src = adaptor.getSrc();
@@ -1894,9 +1897,6 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
     bool fi = op.getFetchInactive();
     bool boundctrl = op.getBoundCtrl();
 
-    if (chipset < kGfx950)
-      return op->emitOpError("permlane_swap is only supported on gfx950+");
-
     SmallVector<Value> decomposed =
         LLVM::decomposeValue(rewriter, loc, src, i32);
 

>From bd606d9ee557113e7636d49d3771728afade164e Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 10:56:20 +0200
Subject: [PATCH 07/12] Update mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Co-authored-by: Jakub Kuderski <kubakuderski at gmail.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 2ccf6b4f46df6..16d51ddc26969 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -691,7 +691,7 @@ Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
     ```
 
     Operands:
-    * `$src`: Vector register to permute across lanes
+    * `$src`: Vector register to permute across lanes of the subgroup.
     * `$kind`: The kind of permutation operation.
     * `$fetch_inactive`: Optional. Used to dertermine behavior of invalid lanes (disabled thread or out-of-range).
       `fetch_inactive = false`: If source lane is invalid, use `bound_ctrl` to determine the source value.

>From b74a108378ed10bcac6911e275b9788a22c07d1e Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 10:56:39 +0200
Subject: [PATCH 08/12] Update mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Co-authored-by: Jakub Kuderski <kubakuderski at gmail.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 16d51ddc26969..f93ca82f85bb9 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -684,8 +684,8 @@ Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
     - Swap the data between odd and even rows of 16 lanes (`swap_16`)
     - Swap the data between the first 32 lanes and the last 32 lanes (`swap_32`)
 
-    Format example:
-    ```
+    Example:
+    ```mlir
     %0 = amdgpu.permlane %src swap_16 : f16
     %1 = amdgpu.permlane %src swap_32 { fetch_inactive = true, bound_ctrl = true } : f16
     ```

>From 362fddab556b5ab7b82a445fd3283b8ed8382e3a Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 10:56:54 +0200
Subject: [PATCH 09/12] Update mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Co-authored-by: Jakub Kuderski <kubakuderski at gmail.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index f93ca82f85bb9..d6cb391a3c7a7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -680,7 +680,7 @@ Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
 
     Supports arbitrary int/float/vector types, which will be repacked to i32 and
     one or more `rocdl.permlane.*` ops during lowering.
-    The following lane permutations are supported:
+    Supported lane permutations:
     - Swap the data between odd and even rows of 16 lanes (`swap_16`)
     - Swap the data between the first 32 lanes and the last 32 lanes (`swap_32`)
 

>From a383b094da4b9cc85f97a92a151e918876085122 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 05:13:17 -0500
Subject: [PATCH 10/12] let arguments

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d6cb391a3c7a7..9a20f44058253 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -669,11 +669,7 @@ def AMDGPU_PermlanePerm : I32EnumAttr<"PermlanePerm",
 def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
   "permlane_perm">;
 
-def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]>,
-Arguments<(ins AnyIntegerOrFloatOr1DVector:$src, 
-               AMDGPU_PermlanePermAttr:$kind,
-               DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
-               DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl)> {
+def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]> {
   let summary = "AMDGPU permlane op";
   let description = [{
     High-level wrapper on `rocdl.permlane.*` variants.
@@ -704,6 +700,10 @@ Arguments<(ins AnyIntegerOrFloatOr1DVector:$src,
 
     Note: Lowering is only supported on gfx950 and up.
   }];
+  let arguments = (ins AnyIntegerOrFloatOr1DVector:$src,
+                       AMDGPU_PermlanePermAttr:$kind,
+                       DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
+                       DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl);
   let results = (outs AnyIntegerOrFloatOr1DVector:$result);
   let assemblyFormat = [{
     $src $kind attr-dict `:` type($result)

>From 90e65a5df1c66651e79d12196ca88bb1f3db447a Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 05:13:35 -0500
Subject: [PATCH 11/12] simplify docs

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 9a20f44058253..0a544164fb71b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -672,7 +672,8 @@ def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
 def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]> {
   let summary = "AMDGPU permlane op";
   let description = [{
-    High-level wrapper on `rocdl.permlane.*` variants.
+    High-level wrapper on `rocdl.permlane.*` variants for permutations
+    on rows of lanes in a subgroup.
 
     Supports arbitrary int/float/vector types, which will be repacked to i32 and
     one or more `rocdl.permlane.*` ops during lowering.
@@ -689,14 +690,13 @@ def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "s
     Operands:
     * `$src`: Vector register to permute across lanes of the subgroup.
     * `$kind`: The kind of permutation operation.
-    * `$fetch_inactive`: Optional. Used to dertermine behavior of invalid lanes (disabled thread or out-of-range).
-      `fetch_inactive = false`: If source lane is invalid, use `bound_ctrl` to determine the source value.
-      `fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway
-      (ignoring `bound_ctrl`). If the source lane is out-of-range, behavior is decided by `bound_ctrl`.
+    * `$fetch_inactive`: Optional. Used to determine behavior of a fetch from a disabled lane.
+      `fetch_inactive = false`: If the source lane is disabled, use `bound_ctrl` to determine the source value.
+      `fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway (ignoring `bound_ctrl`).
     * `$bound_ctrl`: Optional. Used to determine what a thread should do if its source operand is from 
-      a disabled thread or invalid input: use the value zero, or disable the write.
-      `bound_ctrl = false`: Do not write when source is invalid or out-of-range.
-      `bound_ctrl = true`: Use zero as input if source is invalid or out-of-range
+      a disabled lane: use the value zero, or disable the write. 
+      `bound_ctrl = false`: Do not write when source is from a disabled lane
+      `bound_ctrl = true`: Use zero as input if source is from a disabled lane
 
     Note: Lowering is only supported on gfx950 and up.
   }];

>From 1b2e5ff24ef6ef43b445e6f456127cfd218fc233 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Thu, 21 Aug 2025 10:11:38 -0500
Subject: [PATCH 12/12] rename to permlane_swap

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 38 +++++++------------
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 19 +++++-----
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 12 ++++++
 .../Conversion/AMDGPUToROCDL/permlane.mlir    | 22 +++++------
 mlir/test/Dialect/AMDGPU/ops.mlir             |  8 ++--
 5 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 0a544164fb71b..72aca2938e029 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -656,40 +656,27 @@ def AMDGPU_SwizzleBitModeOp : AMDGPU_Op<"swizzle_bitmode",
   }];
 }
 
-def AMDGPU_PermlanePerm : I32EnumAttr<"PermlanePerm",
-    "The possible permutations for a permlane operation",
-    [
-      I32EnumAttrCase<"swap_16",  0>,
-      I32EnumAttrCase<"swap_32",  1>,
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::amdgpu";
-}
-
-def AMDGPU_PermlanePermAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_PermlanePerm,
-  "permlane_perm">;
-
-def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "src"]>]> {
-  let summary = "AMDGPU permlane op";
+def AMDGPU_PermlaneSwapOp : AMDGPU_Op<"permlane_swap", [Pure, AllTypesMatch<["result", "src"]>]> {
+  let summary = "AMDGPU permlane swap op";
   let description = [{
-    High-level wrapper on `rocdl.permlane.*` variants for permutations
-     on rows of lanes in a subgroup.
+    High-level wrapper on `rocdl.permlane{16,32}.swap` variants for permutations
+    on rows of lanes in a subgroup.
 
     Supports arbitrary int/float/vector types, which will be repacked to i32 and
-    one or more `rocdl.permlane.*` ops during lowering.
+    one or more `rocdl.permlane{16,32}.swap` ops during lowering.
     Supported lane permutations:
-    - Swap the data between odd and even rows of 16 lanes (`swap_16`)
-    - Swap the data between the first 32 lanes and the last 32 lanes (`swap_32`)
+    - Swap the data between odd and even rows of 16 lanes
+    - Swap the data between the first 32 lanes and the last 32 lanes
 
     Example:
     ```mlir
-    %0 = amdgpu.permlane %src swap_16 : f16
-    %1 = amdgpu.permlane %src swap_32 { fetch_inactive = true, bound_ctrl = true } : f16
+    %0 = amdgpu.permlane_swap %src 16 : f16
+    %1 = amdgpu.permlane_swap %src 32 { fetch_inactive = true, bound_ctrl = true } : f16
     ```
 
     Operands:
     * `$src`: Vector register to permute across lanes of the subgroup.
-    * `$kind`: The kind of permutation operation.
+    * `$row_length`: The length of a row to permute in number of lanes (valid values are 16 and 32).
     * `$fetch_inactive`: Optional. Used to determine behavior of a fetch from a disabled lane.
       `fetch_inactive = false`: If the source lane is disabled, use `bound_ctrl` to determine the source value.
       `fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway (ignoring `bound_ctrl`).
@@ -701,13 +688,14 @@ def AMDGPU_PermlaneOp : AMDGPU_Op<"permlane", [Pure, AllTypesMatch<["result", "s
     Note: Lowering is only supported on gfx950 and up.
   }];
   let arguments = (ins AnyIntegerOrFloatOr1DVector:$src,
-                       AMDGPU_PermlanePermAttr:$kind,
+                       I32Attr:$row_length,
                        DefaultValuedAttr<BoolAttr, "false">:$fetch_inactive,
                        DefaultValuedAttr<BoolAttr, "false">:$bound_ctrl);
   let results = (outs AnyIntegerOrFloatOr1DVector:$result);
   let assemblyFormat = [{
-    $src $kind attr-dict `:` type($result)
+    $src $row_length attr-dict `:` type($result)
   }];
+  let hasVerifier = 1;
 }
 
 def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 605ab4ec757d1..b44d647cf7632 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1877,15 +1877,15 @@ struct AMDGPUSwizzleBitModeLowering
   }
 };
 
-struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
+struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
 
   AMDGPUPermlaneLowering(const LLVMTypeConverter &converter, Chipset chipset)
-      : ConvertOpToLLVMPattern<PermlaneOp>(converter), chipset(chipset) {}
+      : ConvertOpToLLVMPattern<PermlaneSwapOp>(converter), chipset(chipset) {}
   Chipset chipset;
 
   LogicalResult
-  matchAndRewrite(PermlaneOp op, OpAdaptor adaptor,
+  matchAndRewrite(PermlaneSwapOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     if (chipset < kGfx950)
       return op->emitOpError("permlane_swap is only supported on gfx950+");
@@ -1893,7 +1893,7 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
     Location loc = op.getLoc();
     Type i32 = rewriter.getI32Type();
     Value src = adaptor.getSrc();
-    auto kind = op.getKind();
+    unsigned row_length = op.getRowLength();
     bool fi = op.getFetchInactive();
     bool boundctrl = op.getBoundCtrl();
 
@@ -1905,16 +1905,15 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneOp> {
       Value res;
       Type i32pair = LLVM::LLVMStructType::getLiteral(
           rewriter.getContext(), {v.getType(), v.getType()});
-      switch (kind) {
-      case PermlanePerm::swap_16:
+
+      if (row_length == 16)
         res = ROCDL::Permlane16SwapOp::create(rewriter, loc, i32pair, v, v, fi,
                                               boundctrl);
-        break;
-      case PermlanePerm::swap_32:
+      else if (row_length == 32)
         res = ROCDL::Permlane32SwapOp::create(rewriter, loc, i32pair, v, v, fi,
                                               boundctrl);
-        break;
-      }
+      else
+        llvm_unreachable("unsupported row length");
 
       Value vdstNew = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
       permuted.emplace_back(vdstNew);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index d7ffdcb58ddb5..11a40d663a201 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -510,6 +510,18 @@ LogicalResult DPPOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// PermlaneSwapOp
+//===----------------------------------------------------------------------===//
+LogicalResult PermlaneSwapOp::verify() {
+  unsigned rowLength = getRowLength();
+
+  if (rowLength != 16 && rowLength != 32)
+    return emitOpError("row_length attribute must either be 16 or 32.");
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // GatherToLDSOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
index fa0049274d9cf..aae2b1d0fd90c 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/permlane.mlir
@@ -6,7 +6,7 @@ func.func @test_permlane16_i32(%arg0 : i32) -> i32 {
 // CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  return %[[RES]] : i32
-  %0 = amdgpu.permlane %arg0 swap_16 : i32
+  %0 = amdgpu.permlane_swap %arg0 16 : i32
   return %0 : i32
 }
 
@@ -16,7 +16,7 @@ func.func @test_permlane16_i32_optional_attr(%arg0 : i32) -> i32 {
 // CHECK:  %[[PERM:.*]] = rocdl.permlane16.swap %[[ARG0]], %[[ARG0]], true, true : (i32, i32) -> <(i32, i32)>
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  return %[[RES]] : i32
-  %0 = amdgpu.permlane %arg0 swap_16 { fetch_inactive = true, bound_ctrl = true }  : i32
+  %0 = amdgpu.permlane_swap %arg0 16 { fetch_inactive = true, bound_ctrl = true }  : i32
   return %0 : i32
 }
 
@@ -26,7 +26,7 @@ func.func @test_permlane32_i32(%arg0 : i32) -> i32 {
 // CHECK:  %[[PERM:.*]] = rocdl.permlane32.swap %[[ARG0]], %[[ARG0]], false, false : (i32, i32) -> <(i32, i32)>
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  return %[[RES]] : i32
-  %0 = amdgpu.permlane %arg0 swap_32 : i32
+  %0 = amdgpu.permlane_swap %arg0 32 : i32
   return %0 : i32
 }
 
@@ -38,7 +38,7 @@ func.func @test_permlane16_f32(%arg0 : f32) -> f32 {
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
 // CHECK:  return %[[RES_CAST]] : f32
-  %0 = amdgpu.permlane %arg0 swap_16 : f32
+  %0 = amdgpu.permlane_swap %arg0 16 : f32
   return %0 : f32
 }
 
@@ -50,7 +50,7 @@ func.func @test_permlane32_f32(%arg0 : f32) -> f32 {
 // CHECK:  %[[RES:.*]] = llvm.extractvalue %[[PERM]][0] : !llvm.struct<(i32, i32)>
 // CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[RES]] : i32 to f32
 // CHECK:  return %[[RES_CAST]] : f32
-  %0 = amdgpu.permlane %arg0 swap_32 : f32
+  %0 = amdgpu.permlane_swap %arg0 32 : f32
   return %0 : f32
 }
 
@@ -64,7 +64,7 @@ func.func @test_permlane16_f16(%arg0 : f16) -> f16 {
 // CHECK:  %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
 // CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
 // CHECK:  return %[[RES_CAST]] : f16
-  %0 = amdgpu.permlane %arg0 swap_16 : f16
+  %0 = amdgpu.permlane_swap %arg0 16 : f16
   return %0 : f16
 }
 
@@ -78,7 +78,7 @@ func.func @test_permlane32_f16(%arg0 : f16) -> f16 {
 // CHECK:  %[[TRUNC:.*]] = llvm.trunc %[[RES]] : i32 to i16
 // CHECK:  %[[RES_CAST:.*]] = llvm.bitcast %[[TRUNC]] : i16 to f16
 // CHECK:  return %[[RES_CAST]] : f16
-  %0 = amdgpu.permlane %arg0 swap_32 : f16
+  %0 = amdgpu.permlane_swap %arg0 32 : f16
   return %0 : f16
 }
 
@@ -97,7 +97,7 @@ func.func @test_permlane16_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
 // CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
 // CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
 // CHECK:      return %[[VEC_INSERT1]] : vector<2xi32>
-  %0 = amdgpu.permlane %arg0 swap_16 : vector<2xi32>
+  %0 = amdgpu.permlane_swap %arg0 16 : vector<2xi32>
   return %0 : vector<2xi32>
 }
 
@@ -116,7 +116,7 @@ func.func @test_permlane32_2xi32(%arg0 : vector<2xi32>) -> vector<2xi32> {
 // CHECK:      %[[VEC_INSERT0:.*]] = llvm.insertelement %[[PERM0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
 // CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
 // CHECK:      return %[[VEC_INSERT1]] : vector<2xi32>
-  %0 = amdgpu.permlane %arg0 swap_32 : vector<2xi32>
+  %0 = amdgpu.permlane_swap %arg0 32 : vector<2xi32>
   return %0 : vector<2xi32>
 }
 
@@ -137,7 +137,7 @@ func.func @test_permlane16_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
 // CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
 // CHECK:      %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
 // CHECK:      return %[[CAST2]] : vector<4xf16>
-  %0 = amdgpu.permlane %arg0 swap_16 : vector<4xf16>
+  %0 = amdgpu.permlane_swap %arg0 16 : vector<4xf16>
   return %0 : vector<4xf16>
 }
 
@@ -158,6 +158,6 @@ func.func @test_permlane32_4xf16(%arg0 : vector<4xf16>) -> vector<4xf16> {
 // CHECK:      %[[VEC_INSERT1:.*]] = llvm.insertelement %[[PERM1]], %[[VEC_INSERT0]][%[[C1]] : i32] : vector<2xi32>
 // CHECK:      %[[CAST2:.*]] = llvm.bitcast %[[VEC_INSERT1]] : vector<2xi32> to vector<4xf16>
 // CHECK:      return %[[CAST2]] : vector<4xf16>
-  %0 = amdgpu.permlane %arg0 swap_32 : vector<4xf16>
+  %0 = amdgpu.permlane_swap %arg0 32 : vector<4xf16>
   return %0 : vector<4xf16>
 }
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 5eb6d53d4d9fe..369e0fff538e1 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -526,15 +526,15 @@ func.func @swizzle_bitmode(%arg0 : f32) -> f32 {
 
 // CHECK-LABEL: func @permlane16_swap
 func.func @permlane16_swap(%arg0 : f32) -> f32 {
-  // CHECK: amdgpu.permlane
-  %0 = amdgpu.permlane %arg0 swap_16 : f32
+  // CHECK: amdgpu.permlane_swap
+  %0 = amdgpu.permlane_swap %arg0 16 : f32
   func.return %0 : f32
 }
 
 // CHECK-LABEL: func @permlane32_swap
 func.func @permlane32_swap(%arg0 : f32) -> f32 {
-  // CHECK: amdgpu.permlane
-  %0 = amdgpu.permlane %arg0 swap_32 : f32
+  // CHECK: amdgpu.permlane_swap
+  %0 = amdgpu.permlane_swap %arg0 32 : f32
   func.return %0 : f32
 }