[Mlir-commits] [mlir] 8b907a3 - [mlir] GPUToROCDL: repack unsupported types when lowering `subgroup_broadcast` (#174206)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Jan 6 12:26:51 PST 2026
Author: Ivan Butygin
Date: 2026-01-06T23:26:47+03:00
New Revision: 8b907a3a2072fcf05b7c85cbb7ad46c379eafaee
URL: https://github.com/llvm/llvm-project/commit/8b907a3a2072fcf05b7c85cbb7ad46c379eafaee
DIFF: https://github.com/llvm/llvm-project/commit/8b907a3a2072fcf05b7c85cbb7ad46c379eafaee.diff
LOG: [mlir] GPUToROCDL: repack unsupported types when lowering `subgroup_broadcast` (#174206)
Use the same repacking logic as for shuffle/swizzle.
Added:
Modified:
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index b8eb6d7facc6d..e5a40cfd090bd 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -171,11 +171,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
};
static bool isSupportedReadLaneType(Type type) {
- // read(first)lane also supports some vector types, but limit it for scalars
- // for now.
- return type.isInteger(16) || type.isInteger(32) || type.isInteger(64) ||
- isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
- LLVM::LLVMPointerType>(type);
+ // https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
+ if (isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
+ LLVM::LLVMPointerType>(type))
+ return true;
+
+ if (auto intType = dyn_cast<IntegerType>(type))
+ return llvm::is_contained({16, 32, 64},
+ static_cast<int>(intType.getWidth()));
+
+ if (auto vecType = dyn_cast<VectorType>(type)) {
+ Type elementType = vecType.getElementType();
+ if (elementType.isInteger(32))
+ return true;
+
+ if (vecType.getNumElements() == 2 &&
+ (isa<Float16Type, BFloat16Type>(elementType) ||
+ elementType.isInteger(16)))
+ return true;
+ }
+
+ return false;
}
struct GPUSubgroupBroadcastOpToROCDL
@@ -186,17 +202,38 @@ struct GPUSubgroupBroadcastOpToROCDL
matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value src = adaptor.getSrc();
- if (!isSupportedReadLaneType(src.getType()))
- return rewriter.notifyMatchFailure(op, "unsupported readlane type");
+ if (isSupportedReadLaneType(src.getType())) {
+ Value result = createReadlaneOp(op, adaptor, rewriter, src);
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+
+ Type i32 = rewriter.getI32Type();
+ Location loc = op.getLoc();
+ SmallVector<Value> decomposed =
+ LLVM::decomposeValue(rewriter, loc, src, i32);
+ SmallVector<Value> results;
+ results.reserve(decomposed.size());
+ for (Value v : decomposed)
+ results.emplace_back(createReadlaneOp(op, adaptor, rewriter, v));
+
+ Value result = LLVM::composeValue(rewriter, loc, results, src.getType());
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+
+private:
+ static Value createReadlaneOp(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Value src) {
if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
- rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
- adaptor.getLane());
+ return ROCDL::ReadlaneOp::create(rewriter, op.getLoc(), src.getType(),
+ src, adaptor.getLane());
} else { // first_active_lane
- rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
- src);
+ return ROCDL::ReadfirstlaneOp::create(rewriter, op.getLoc(),
+ src.getType(), src);
}
- return success();
}
};
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index ef631ce8a12e5..2fd923743281e 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -821,4 +821,41 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
func.return %0, %1 : index, index
}
+
+// CHECK-LABEL: func @broadcast_i8
+// CHECK-SAME: (%[[ARG:.*]]: i8, %[[IDX:.*]]: i32)
+func.func @broadcast_i8(%arg0 : i8, %arg1 : i32) -> (i8, i8) {
+// CHECK: %[[I32_1:.*]] = llvm.zext %[[ARG]] : i8 to i32
+// CHECK: %[[R1:.*]] = rocdl.readfirstlane %[[I32_1]] : i32
+// CHECK: %[[RES1:.*]] = llvm.trunc %[[R1]] : i32 to i8
+// CHECK: %[[I32_2:.*]] = llvm.zext %[[ARG]] : i8 to i32
+// CHECK: %[[R2:.*]] = rocdl.readlane %[[I32_2]], %[[IDX]] : (i32, i32) -> i32
+// CHECK: %[[RES2:.*]] = llvm.trunc %[[R2]] : i32 to i8
+// CHECK: %{{.*}} = llvm.insertvalue %[[RES1]], %{{.*}}[0] : !llvm.struct<(i8, i8)>
+// CHECK: %{{.*}} = llvm.insertvalue %[[RES2]], %{{.*}}[1] : !llvm.struct<(i8, i8)>
+ %0 = gpu.subgroup_broadcast %arg0, first_active_lane : i8
+ %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : i8
+ func.return %0, %1 : i8, i8
+}
+
+// CHECK-LABEL: func @broadcast_4xi16
+// CHECK-SAME: (%[[ARG:.*]]: vector<4xi16>)
+func.func @broadcast_4xi16(%arg0 : vector<4xi16>) -> vector<4xi16> {
+// CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[ARG:.*]] : vector<4xi16> to vector<2xi32>
+// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[BITCAST]][%[[C0]] : i32] : vector<2xi32>
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[BITCAST]][%[[C1]] : i32] : vector<2xi32>
+// CHECK: %[[RFL0:.*]] = rocdl.readfirstlane %[[ELEM0]] : i32
+// CHECK: %[[RFL1:.*]] = rocdl.readfirstlane %[[ELEM1]] : i32
+// CHECK: %[[POISON:.*]] = llvm.mlir.poison : vector<2xi32>
+// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[INS0:.*]] = llvm.insertelement %[[RFL0]], %[[POISON]][%[[C0]] : i32] : vector<2xi32>
+// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[INS1:.*]] = llvm.insertelement %[[RFL1]], %[[INS0]][%[[C1]] : i32] : vector<2xi32>
+// CHECK: %[[BITCAST2:.*]] = llvm.bitcast %[[INS1]] : vector<2xi32> to vector<4xi16>
+// CHECK: return %[[BITCAST2]] : vector<4xi16>
+ %0 = gpu.subgroup_broadcast %arg0, first_active_lane : vector<4xi16>
+ func.return %0 : vector<4xi16>
+}
}
More information about the Mlir-commits
mailing list