[mlir] [llvm] [mlir] Lower math dialect later in gpu-lower-to-nvvm-pipeline (PR #78556)

Thu Jan 18 01:49:50 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-mlir-gpu

Author: Guray Ozen (grypp)

<details>
<summary>Changes</summary>

This PR moves lowering of math dialect later in the pipeline. Because math dialect is lowered correctly by `createConvertGpuOpsToNVVMOps` for GPU target, and it needs to run it first.

---

Patch is 89.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78556.diff


3 Files Affected:

- (modified) mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp (+1-1) 
- (added) mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir (+18) 
- (added) stmatrix/everything-casted.mlir (+1267) 


``````````diff

diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 0b4739214bf2f1..935f0deaf9c8a6 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -51,7 +51,6 @@ void buildCommonPassPipeline(
   pm.addPass(createConvertVectorToSCFPass());
   pm.addPass(createConvertSCFToCFPass());
   pm.addPass(createConvertNVVMToLLVMPass());
-  pm.addPass(createConvertMathToLLVMPass());
   pm.addPass(createConvertFuncToLLVMPass());
   pm.addPass(memref::createExpandStridedMetadataPass());
 
@@ -98,6 +97,7 @@ void buildHostPostPipeline(OpPassManager &pm,
   GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
   gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
   pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+  pm.addPass(createConvertMathToLLVMPass());
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
new file mode 100644
index 00000000000000..7c55059f4a84dc
--- /dev/null
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=isa" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: llvm.func @test_math(%arg0: f32) {
+func.func @test_math(%arg0 : f32) {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    // CHECK: gpu.launch_func  @test_math_kernel::@test_math_kernel
+    // CHECK: gpu.binary @test_math_kernel  [#gpu.object<#nvvm.target
+    gpu.launch 
+        blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1) 
+        threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { 
+         %s1 = math.exp %arg0 : f32
+        gpu.printf "%f" %s1 : f32
+        gpu.printf "Hello from %d\n" %6 : index
+        gpu.terminator
+    }
+    return
+}
\ No newline at end of file
diff --git a/stmatrix/everything-casted.mlir b/stmatrix/everything-casted.mlir
new file mode 100644
index 00000000000000..28ebad10c6a3ce
--- /dev/null
+++ b/stmatrix/everything-casted.mlir
@@ -0,0 +1,1267 @@
+// -----// IR Dump After ConvertNVGPUToNVVMPass (convert-nvgpu-to-nvvm) //----- //
+module {
+  gpu.module @asd {
+    gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+      %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+      memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+      %1 = llvm.mlir.constant(0 : i32) : i32
+      %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+      %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+      nvvm.wgmma.fence.aligned
+      %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+      %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      nvvm.wgmma.commit.group.sync.aligned
+      nvvm.wgmma.wait.group.sync.aligned 1
+      %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %20 = llvm.mlir.constant(1 : i32) : i32
+      %21 = llvm.mlir.constant(2 : i32) : i32
+      %22 = llvm.mlir.constant(4 : i32) : i32
+      %23 = llvm.mlir.constant(8 : i32) : i32
+      %24 = llvm.mlir.constant(16 : i32) : i32
+      %25 = llvm.mlir.constant(32 : i32) : i32
+      %26 = nvvm.read.ptx.sreg.tid.x : i32
+      %27 = llvm.urem %26, %25  : i32
+      %28 = llvm.udiv %26, %25  : i32
+      %29 = llvm.udiv %27, %22  : i32
+      %30 = llvm.urem %27, %22  : i32
+      %31 = llvm.mul %30, %21  : i32
+      %32 = llvm.mul %28, %24  : i32
+      %33 = llvm.add %29, %32  : i32
+      %34 = llvm.mlir.constant(0 : i32) : i32
+      %35 = llvm.mul %34, %23  : i32
+      %36 = llvm.add %33, %35  : i32
+      %37 = llvm.mlir.constant(0 : i32) : i32
+      %38 = llvm.mul %37, %23  : i32
+      %39 = llvm.add %31, %38  : i32
+      %40 = arith.index_cast %36 : i32 to index
+      %41 = arith.index_cast %39 : i32 to index
+      %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+      vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+      %44 = llvm.mlir.constant(1 : i32) : i32
+      %45 = llvm.mul %44, %23  : i32
+      %46 = llvm.add %33, %45  : i32
+      %47 = llvm.mlir.constant(0 : i32) : i32
+      %48 = llvm.mul %47, %23  : i32
+      %49 = llvm.add %31, %48  : i32
+      %50 = arith.index_cast %46 : i32 to index
+      %51 = arith.index_cast %49 : i32 to index
+      %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+      vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+      gpu.return
+    }
+  }
+}
+
+
+// -----// IR Dump After GpuKernelOutlining (gpu-kernel-outlining) //----- //
+module {
+  gpu.module @asd {
+    gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+      %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+      memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+      %1 = llvm.mlir.constant(0 : i32) : i32
+      %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+      %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+      nvvm.wgmma.fence.aligned
+      %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+      %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      nvvm.wgmma.commit.group.sync.aligned
+      nvvm.wgmma.wait.group.sync.aligned 1
+      %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %20 = llvm.mlir.constant(1 : i32) : i32
+      %21 = llvm.mlir.constant(2 : i32) : i32
+      %22 = llvm.mlir.constant(4 : i32) : i32
+      %23 = llvm.mlir.constant(8 : i32) : i32
+      %24 = llvm.mlir.constant(16 : i32) : i32
+      %25 = llvm.mlir.constant(32 : i32) : i32
+      %26 = nvvm.read.ptx.sreg.tid.x : i32
+      %27 = llvm.urem %26, %25  : i32
+      %28 = llvm.udiv %26, %25  : i32
+      %29 = llvm.udiv %27, %22  : i32
+      %30 = llvm.urem %27, %22  : i32
+      %31 = llvm.mul %30, %21  : i32
+      %32 = llvm.mul %28, %24  : i32
+      %33 = llvm.add %29, %32  : i32
+      %34 = llvm.mlir.constant(0 : i32) : i32
+      %35 = llvm.mul %34, %23  : i32
+      %36 = llvm.add %33, %35  : i32
+      %37 = llvm.mlir.constant(0 : i32) : i32
+      %38 = llvm.mul %37, %23  : i32
+      %39 = llvm.add %31, %38  : i32
+      %40 = arith.index_cast %36 : i32 to index
+      %41 = arith.index_cast %39 : i32 to index
+      %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+      vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+      %44 = llvm.mlir.constant(1 : i32) : i32
+      %45 = llvm.mul %44, %23  : i32
+      %46 = llvm.add %33, %45  : i32
+      %47 = llvm.mlir.constant(0 : i32) : i32
+      %48 = llvm.mul %47, %23  : i32
+      %49 = llvm.add %31, %48  : i32
+      %50 = arith.index_cast %46 : i32 to index
+      %51 = arith.index_cast %49 : i32 to index
+      %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+      vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+      gpu.return
+    }
+  }
+}
+
+
+// -----// IR Dump After ConvertVectorToSCF (convert-vector-to-scf) //----- //
+module {
+  gpu.module @asd {
+    gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+      %0 = llvm.mlir.constant(32 : i32) : i32
+      %1 = llvm.mlir.constant(16 : i32) : i32
+      %2 = llvm.mlir.constant(8 : i32) : i32
+      %3 = llvm.mlir.constant(4 : i32) : i32
+      %4 = llvm.mlir.constant(2 : i32) : i32
+      %5 = llvm.mlir.constant(1 : i32) : i32
+      %6 = llvm.mlir.constant(0 : i32) : i32
+      memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+      %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      nvvm.wgmma.fence.aligned
+      %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+      nvvm.wgmma.commit.group.sync.aligned
+      nvvm.wgmma.wait.group.sync.aligned 1
+      %18 = nvvm.read.ptx.sreg.tid.x : i32
+      %19 = llvm.urem %18, %0  : i32
+      %20 = llvm.udiv %18, %0  : i32
+      %21 = llvm.udiv %19, %3  : i32
+      %22 = llvm.urem %19, %3  : i32
+      %23 = llvm.mul %22, %4  : i32
+      %24 = llvm.mul %20, %1  : i32
+      %25 = llvm.add %21, %24  : i32
+      %26 = llvm.mul %6, %2  : i32
+      %27 = llvm.add %25, %26  : i32
+      %28 = llvm.mul %6, %2  : i32
+      %29 = llvm.add %23, %28  : i32
+      %30 = arith.index_cast %27 : i32 to index
+      %31 = arith.index_cast %29 : i32 to index
+      %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+      vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+      %34 = llvm.mul %5, %2  : i32
+      %35 = llvm.add %25, %34  : i32
+      %36 = llvm.mul %6, %2  : i32
+      %37 = llvm.add %23, %36  : i32
+      %38 = arith.index_cast %35 : i32 to index
+      %39 = arith.index_cast %37 : i32 to index
+      %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+      vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+      gpu.return
+    }
+  }
+}
+
+
+// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
+module {
+  gpu.module @asd {
+    gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+      %0 = llvm.mlir.constant(32 : i32) : i32
+      %1 = llvm.mlir.constant(16 : i32) : i32
+      %2 = llvm.mlir.constant(8 : i32) : i32
+      %3 = llvm.mlir.constant(4 : i32) : i32
+      %4 = llvm.mlir.constant(2 : i32) : i32
+      %5 = llvm.mlir.constant(1 : i32) : i32
+      %6 = llvm.mlir.constant(0 : i32) : i32
+      memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+      %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      nvvm.wgmma.fence.aligned
+      %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+      nvvm.wgmma.commit.group.sync.aligned
+      nvvm.wgmma.wait.group.sync.aligned 1
+      %18 = nvvm.read.ptx.sreg.tid.x : i32
+      %19 = llvm.urem %18, %0  : i32
+      %20 = llvm.udiv %18, %0  : i32
+      %21 = llvm.udiv %19, %3  : i32
+      %22 = llvm.urem %19, %3  : i32
+      %23 = llvm.mul %22, %4  : i32
+      %24 = llvm.mul %20, %1  : i32
+      %25 = llvm.add %21, %24  : i32
+      %26 = llvm.mul %6, %2  : i32
+      %27 = llvm.add %25, %26  : i32
+      %28 = llvm.mul %6, %2  : i32
+      %29 = llvm.add %23, %28  : i32
+      %30 = arith.index_cast %27 : i32 to index
+      %31 = arith.index_cast %29 : i32 to index
+      %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+      vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+      %34 = llvm.mul %5, %2  : i32
+      %35 = llvm.add %25, %34  : i32
+      %36 = llvm.mul %6, %2  : i32
+      %37 = llvm.add %23, %36  : i32
+      %38 = arith.index_cast %35 : i32 to index
+      %39 = arith.index_cast %37 : i32 to index
+      %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+      vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+      gpu.return
+    }
+  }
+}
+
+
+// -----// IR Dump After ConvertNVVMToLLVMPass (convert-nvvm-to-llvm) //----- //
+module {
+  gpu.module @asd {
+    gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+      %0 = llvm.mlir.constant(32 : i32) : i32
+      %1 = llvm.mlir.constant(16 : i32) : i32
+      %2 = llvm.mlir.constant(8 : i32) : i32
+      %3 = llvm.mlir.constant(4 : i32) : i32
+      %4 = llvm.mlir.constant(2 : i32) : i32
+      %5 = llvm.mlir.constant(1 : i32) : i32
+      %6 = llvm.mlir.constant(0 : i32) : i32
+      memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+      %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+      %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)> 
+      %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;", ""  : () -> ()
+      %17 = llvm.mlir.constant(1 : i32) : i32
+      %18 = llvm.mlir.constant(1 : i32) : i32
+      %19 = llvm.mlir.constant(1 : i32) : i32
+      %20 = llvm.mlir.constant(0 : i32) : i32
+      %21 = llvm.mlir.constant(1 : i32) : i32
+      %22 = llvm.extractvalue %16[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %23 = llvm.extractvalue %16[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %24 = llvm.extractvalue %16[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %25 = llvm.extractvalue %16[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %26 = llvm.extractvalue %16[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %27 = llvm.extractvalue %16[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %28 = llvm.extractvalue %16[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %29 = llvm.extractvalue %16[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> 
+      %30 = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $18, 0;\0Awgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 {$0, $1, $2, $3, $4, $5, $6, $7}, $16, $17, p, $19,  $20, $21,  $22;\0A}\0A", "=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,l,l,n,n,n,n,n" %22, %23, %24, %25, %26, %27, %28, %29, %arg1, %...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/78556