[Mlir-commits] [mlir] [MLIR][AMDGPU] Fixing word alignment check for bufferload fastpath (PR #135982)

Wed Apr 16 09:14:11 PDT 2025

https://github.com/jerryyin created https://github.com/llvm/llvm-project/pull/135982

`delta_bytes % (32 ceilDiv elementBitwidth) != 0` condition is incorrect in https://github.com/llvm/llvm-project/pull/135014

For example, last load is issued to load only one last element of fp16. Then `delta bytes = 2`, `(32 ceildiv 16) = 2`. In this case it will be judged as word aligned. It will send to fast path but get all zeros for the fp16 because it cross the word boundary.

In reality the equation should be just `delta_bytes % 4` , since a word is 4 bytes. This PR fix the bug by amending the mod target to 4.

>From 68781c2fdda5543e0c3f522cc764d96ff3587a7e Mon Sep 17 00:00:00 2001
From: jerryyin <zhuoryin at amd.com>
Date: Wed, 16 Apr 2025 16:10:56 +0000
Subject: [PATCH] Fixing word alignment check for bufferload fastpath

---
 .../AMDGPU/Transforms/TransferReadToLoad.cpp      |  7 +++----
 .../Dialect/AMDGPU/transfer-read-to-load.mlir     | 15 ++++++---------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
index f665c1794cdd4..7268bd56f43f6 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
@@ -225,15 +225,14 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
     Value isOutofBounds = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::ult, delta, vectorSizeOffset);
 
-    // 2) check if (detla_bytes % (32 / elementBitwidth) != 0)
+    // 2) check if (detla_bytes % bytes_per_word != 0)
     Value deltaBytes = rewriter.create<arith::MulIOp>(
         loc, delta,
         rewriter.create<arith::ConstantIndexOp>(loc, elementBitWidth / 8));
-    Value elementsPerWord = rewriter.create<arith::ConstantIndexOp>(
-        loc, llvm::divideCeil(32, elementBitWidth));
+    Value bytesPerWord = rewriter.create<arith::ConstantIndexOp>(loc, 4);
     Value isNotWordAligned = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::ne,
-        rewriter.create<arith::RemUIOp>(loc, deltaBytes, elementsPerWord),
+        rewriter.create<arith::RemUIOp>(loc, deltaBytes, bytesPerWord),
         rewriter.create<arith::ConstantIndexOp>(loc, 0));
 
     // We take the fallback of transfer_read default lowering only it is both
diff --git a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
index d0805b6b8a973..d41adf3ad2c7f 100644
--- a/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
+++ b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
@@ -10,8 +10,7 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad
   return %res : vector<4xf32>
 }
 
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
 // CHECK: vector.transfer_read %[[ARG0]][%[[ARG1]], %[[ARG1]]]
 
 // CHECK: } else {
@@ -35,14 +34,14 @@ func.func @transfer_to_maskedload_fatrawbuffer_f16(%mem : memref<8x8xf16, #amdgp
 // CHECK-DAG: %[[C0:.*]] = arith.constant 0
 // CHECK-DAG: %[[SIZE:.*]] = arith.constant 64
 // CHECK-DAG: %[[BYTES:.*]] = arith.constant 2
-// CHECK-DAG: %[[VECTORSIZE:.*]] = arith.constant 4
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4
 
 // CHECK: %[[LINEAR:.*]] = affine.apply #map()[%[[ARG1]], %[[ARG2]]]
 // CHECK: %[[DELTA:.*]] = arith.subi %[[SIZE]], %[[LINEAR]]
-// CHECK: %[[COND1:.*]] = arith.cmpi ult, %[[DELTA]], %[[VECTORSIZE]]
+// CHECK: %[[COND1:.*]] = arith.cmpi ult, %[[DELTA]], %[[C4]]
 
 // CHECK: %[[DELTABYTES:.*]] = arith.muli %[[DELTA]], %[[BYTES]]
-// CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[BYTES]]
+// CHECK: %[[REM:.*]] = arith.remui %[[DELTABYTES]], %[[C4]]
 // CHECK: %[[COND2:.*]] = arith.cmpi ne, %[[REM]], %[[C0]]
 
 // CHECK: %[[COND:.*]] = arith.andi %[[COND1]], %[[COND2]]
@@ -120,8 +119,7 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fa
   return %res : vector<4xf32>
 }
 // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<4xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
 // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
 // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
 // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
@@ -140,7 +138,6 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_
   return %res : vector<1xf32>
 }
 // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[IF:.*]] = scf.if %[[FALSE]] -> (vector<1xf32>) {
+// CHECK: %[[IF:.*]] = scf.if
 // CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG1]]]
 // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]