[Mlir-commits] [mlir] 960ab52 - [mlir][nvgpu] Verify invalid copy size (nfc)

Mon Jul 17 08:09:38 PDT 2023

Author: Guray Ozen
Date: 2023-07-17T17:09:33+02:00
New Revision: 960ab5225bea8f58e87eab8fd46d55ce9562b33d

URL: https://github.com/llvm/llvm-project/commit/960ab5225bea8f58e87eab8fd46d55ce9562b33d
DIFF: https://github.com/llvm/llvm-project/commit/960ab5225bea8f58e87eab8fd46d55ce9562b33d.diff

LOG: [mlir][nvgpu] Verify invalid copy size (nfc)

This work improves the verifier so that it rejects invalid copy sizes. It is NFC.

Reviewed By: nicolasvasilache, springerm

Differential Revision: https://reviews.llvm.org/D155448

Added: 
    

Modified: 
    mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
    mlir/test/Dialect/NVGPU/invalid.mlir
    mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 2868660dc656ee..07c29541faf416 100644

--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -76,10 +77,23 @@ LogicalResult DeviceAsyncCopyOp::verify() {
     return emitOpError() << "expected " << dstMemref.getRank()
                          << " destination indices, got "
                          << getDstIndices().size();
+  int64_t dstElements = getDstElements().getZExtValue();
+  int64_t sizeInBytes = (dstMemref.getElementTypeBitWidth() * dstElements) / 8;
+  if (sizeInBytes != 4 && sizeInBytes != 8 && sizeInBytes != 16) {
+    unsigned dstWidth = dstMemref.getElementTypeBitWidth();
+    InFlightDiagnostic diag = emitError();
+    diag << "Requested copy elements is " << dstElements << " with width "
+         << dstMemref.getElementTypeBitWidth()
+         << ". But copy elements could be one of ";
+    if ((32 / dstWidth) > 0)
+      diag << (32 / dstWidth) << ", ";
+    if ((64 / dstWidth) > 0)
+      diag << (64 / dstWidth) << ", ";
+    if ((128 / dstWidth) > 0)
+      diag << (128 / dstWidth) << ".";
+    return diag;
+  }
   if (getBypassL1().has_value()) {
-    int64_t dstElements = getDstElements().getZExtValue();
-    int64_t sizeInBytes =
-        (dstMemref.getElementTypeBitWidth() * dstElements) / 8;
     int64_t req = 16 * 8 / dstMemref.getElementTypeBitWidth();
     if (getBypassL1().value() && sizeInBytes != 16) {
       return emitOpError() << "bypassL1 does not satify alignment for "

diff  --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index a0a8a115a4f424..ef721b18014071 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -194,3 +194,30 @@ func.func @async_cp_zfill_f32_align1(
   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
   return
 }
+
+// -----
+
+func.func @async_cp_size_invalid_f32(
+  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 32. But copy elements could be one of 1, 2, 4.}}
+  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf32> to memref<3x16x128xf32, 3>
+  return
+}
+
+// -----
+
+func.func @async_cp_size_invalid_f16(
+  %src: memref<128x128xf16>, %dst: memref<3x16x128xf16, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 16. But copy elements could be one of 2, 4, 8.}}
+  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf16> to memref<3x16x128xf16, 3>
+  return
+}
+
+// -----
+
+func.func @async_cp_size_invalid_f64(
+  %src: memref<128x128xf64>, %dst: memref<3x16x128xf64, 3>, %i : index) {
+    // expected-error @+1 {{Requested copy elements is 3 with width 64. But copy elements could be one of 1, 2.}}
+  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf64> to memref<3x16x128xf64, 3>
+  return
+}

diff  --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
index 010b3252d21751..5a212815ceb2a1 100644
--- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
+++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
@@ -74,7 +74,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   // CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c1]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
-  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
+  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<64x16xf32, 3>
   %1 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}
@@ -130,7 +130,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   // CHECK: [[xorBits:%.+]] = arith.shli [[src_bits]], [[c2]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
-  %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
+  %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<16x64xf32, 3>
   %3 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}
@@ -175,7 +175,7 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>,
   // CHECK: [[xorBits:%.+]] = arith.shrui [[src_bits]], [[c1]]
   // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol]], [[xorBits]]
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
-  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
+  %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
       : memref<32x32xf64> to memref<32x4xf64, 3>
   %1 = nvgpu.device_async_create_group %0
   nvgpu.device_async_wait %1 { numGroups = 1 : i32}