[Mlir-commits] [mlir] [MLIR][AMDGPU] Fixing word alignment check for bufferload fastpath (PR #135982)
llvmlistbot at llvm.org
Wed Apr 16 09:14:45 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Zhuoran Yin (jerryyin)
The `delta_bytes % (32 ceilDiv elementBitwidth) != 0` condition introduced in https://github.com/llvm/llvm-project/pull/135014 is incorrect.
For example, suppose the last load is issued for only the final fp16 element. Then `delta_bytes = 2` and `(32 ceilDiv 16) = 2`, so the access is judged word-aligned and sent down the fast path, but the load returns all zeros because it crosses the word boundary.
The check should simply be `delta_bytes % 4 != 0`, since a word is 4 bytes. This PR fixes the bug by changing the modulus to 4.
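For concreteness, here is a small standalone C++ sketch of the arithmetic (the helper names are illustrative, not from the patch; the pattern itself emits this computation as `arith` ops):

```cpp
#include <cassert>
#include <cstdint>

// Old check: modulus is the number of elements per 32-bit word.
static bool isNotWordAlignedOld(int64_t delta, int64_t elementBitWidth) {
  int64_t deltaBytes = delta * (elementBitWidth / 8);
  int64_t elementsPerWord = (32 + elementBitWidth - 1) / elementBitWidth;
  return deltaBytes % elementsPerWord != 0; // buggy: mixes bytes with element count
}

// New check: modulus is the word size in bytes.
static bool isNotWordAlignedNew(int64_t delta, int64_t elementBitWidth) {
  int64_t deltaBytes = delta * (elementBitWidth / 8);
  return deltaBytes % 4 != 0; // a word is 4 bytes
}

int main() {
  // Last load covers a single trailing fp16 element: delta = 1 element,
  // deltaBytes = 2, which straddles a word boundary.
  assert(!isNotWordAlignedOld(/*delta=*/1, /*elementBitWidth=*/16)); // misjudged as aligned
  assert(isNotWordAlignedNew(/*delta=*/1, /*elementBitWidth=*/16));  // correctly flagged
  return 0;
}
```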
---
Full diff: https://github.com/llvm/llvm-project/pull/135982.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp (+3-4)
- (modified) mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir (+6-9)
``````````diff
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
index f665c1794cdd4..7268bd56f43f6 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
@@ -225,15 +225,14 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
Value isOutofBounds = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, delta, vectorSizeOffset);
- // 2) check if (detla_bytes % (32 / elementBitwidth) != 0)
+ // 2) check if (detla_bytes % bytes_per_word != 0)
Value deltaBytes = rewriter.create<arith::MulIOp>(
loc, delta,
rewriter.create<arith::ConstantIndexOp>(loc, elementBitWidth / 8));
- Value elementsPerWord = rewriter.create<arith::ConstantIndexOp>(
- loc, llvm::divideCeil(32, elementBitWidth));
+ Value bytesPerWord = rewriter.create<arith::ConstantIndexOp>(loc, 4);
Value isNotWordAligned = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ne,
- rewriter.create<arith::RemUIOp>(loc, deltaBytes, elementsPerWord),
+ rewriter.create<arith::RemUIOp>(loc, deltaBytes, bytesPerWord),
rewriter.create<arith::ConstantIndexOp>(loc, 0));
// We take the fallback of transfer_read default lowering only it is both
diff --git a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
index d0805b6b8a973..d41adf3ad2c7f 100644
--- a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
+++ b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
@@ -10,8 +10,7 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad
return %res : vector<4xf32>
}
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
// CHECK: vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG1]]]
// CHECK: } else {
@@ -35,14 +34,14 @@ func.func @transfer_to_maskedload_fatrawbuffer_f16(%mem : memref<8x8xf16, #amdgp
// CHECK-DAG: %[[C0:.*]] = arith.constant 0
// CHECK-DAG: %[[SIZE:.*]] = arith.constant 64
// CHECK-DAG: %[[BYTES:.*]] = arith.constant 2
-// CHECK-DAG: %[[VECTORSIZE:.*]] = arith.constant 4
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4
// CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[ARG2]]]
// CHECK: %[[DELTA:.*]] = arith.subi %[[SIZE]], %[[LINEAR]]
-// CHECK: %[[COND1:.*]] = arith.cmpi ult, %[[DELTA]], %[[VECTORSIZE]]
+// CHECK: %[[COND1:.*]] = arith.cmpi ult, %[[DELTA]], %[[C4]]
// CHECK: %[[DELTABYTES:.*]] = arith.muli %[[DELTA]], %[[BYTES]]
-// CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[BYTES]]
+// CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[C4]]
// CHECK: %[[COND2:.*]] = arith.cmpi ne, %[[REM]], %[[C0]]
// CHECK: %[[COND:.*]] = arith.andi %[[COND1]], %[[COND2]]
@@ -120,8 +119,7 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fa
return %res : vector<4xf32>
}
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
@@ -140,7 +138,6 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_
return %res : vector<1xf32>
}
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<1xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
// CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG1]]]
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
``````````
https://github.com/llvm/llvm-project/pull/135982