[Mlir-commits] [mlir] [llvm] [mlir] Lower math dialect later in gpu-lower-to-nvvm-pipeline (PR #78556)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Jan 18 01:49:50 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-gpu
Author: Guray Ozen (grypp)
<details>
<summary>Changes</summary>
This PR moves the lowering of the math dialect later in the pipeline. Because the math dialect is lowered correctly by `createConvertGpuOpsToNVVMOps` for the GPU target, that pass needs to run first.
---
Patch is 89.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78556.diff
3 Files Affected:
- (modified) mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp (+1-1)
- (added) mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir (+18)
- (added) stmatrix/everything-casted.mlir (+1267)
``````````diff
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 0b4739214bf2f1..935f0deaf9c8a6 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -51,7 +51,6 @@ void buildCommonPassPipeline(
pm.addPass(createConvertVectorToSCFPass());
pm.addPass(createConvertSCFToCFPass());
pm.addPass(createConvertNVVMToLLVMPass());
- pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createConvertFuncToLLVMPass());
pm.addPass(memref::createExpandStridedMetadataPass());
@@ -98,6 +97,7 @@ void buildHostPostPipeline(OpPassManager &pm,
GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+ pm.addPass(createConvertMathToLLVMPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
new file mode 100644
index 00000000000000..7c55059f4a84dc
--- /dev/null
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=isa" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: llvm.func @test_math(%arg0: f32) {
+func.func @test_math(%arg0 : f32) {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: gpu.launch_func @test_math_kernel::@test_math_kernel
+ // CHECK: gpu.binary @test_math_kernel [#gpu.object<#nvvm.target
+ gpu.launch
+ blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
+ threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
+ %s1 = math.exp %arg0 : f32
+ gpu.printf "%f" %s1 : f32
+ gpu.printf "Hello from %d\n" %6 : index
+ gpu.terminator
+ }
+ return
+}
\ No newline at end of file
diff --git a/stmatrix/everything-casted.mlir b/stmatrix/everything-casted.mlir
new file mode 100644
index 00000000000000..28ebad10c6a3ce
--- /dev/null
+++ b/stmatrix/everything-casted.mlir
@@ -0,0 +1,1267 @@
+// -----// IR Dump After ConvertNVGPUToNVVMPass (convert-nvgpu-to-nvvm) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %1 = llvm.mlir.constant(0 : i32) : i32
+ %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+ %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+ nvvm.wgmma.fence.aligned
+ %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %20 = llvm.mlir.constant(1 : i32) : i32
+ %21 = llvm.mlir.constant(2 : i32) : i32
+ %22 = llvm.mlir.constant(4 : i32) : i32
+ %23 = llvm.mlir.constant(8 : i32) : i32
+ %24 = llvm.mlir.constant(16 : i32) : i32
+ %25 = llvm.mlir.constant(32 : i32) : i32
+ %26 = nvvm.read.ptx.sreg.tid.x : i32
+ %27 = llvm.urem %26, %25 : i32
+ %28 = llvm.udiv %26, %25 : i32
+ %29 = llvm.udiv %27, %22 : i32
+ %30 = llvm.urem %27, %22 : i32
+ %31 = llvm.mul %30, %21 : i32
+ %32 = llvm.mul %28, %24 : i32
+ %33 = llvm.add %29, %32 : i32
+ %34 = llvm.mlir.constant(0 : i32) : i32
+ %35 = llvm.mul %34, %23 : i32
+ %36 = llvm.add %33, %35 : i32
+ %37 = llvm.mlir.constant(0 : i32) : i32
+ %38 = llvm.mul %37, %23 : i32
+ %39 = llvm.add %31, %38 : i32
+ %40 = arith.index_cast %36 : i32 to index
+ %41 = arith.index_cast %39 : i32 to index
+ %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+ vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+ %44 = llvm.mlir.constant(1 : i32) : i32
+ %45 = llvm.mul %44, %23 : i32
+ %46 = llvm.add %33, %45 : i32
+ %47 = llvm.mlir.constant(0 : i32) : i32
+ %48 = llvm.mul %47, %23 : i32
+ %49 = llvm.add %31, %48 : i32
+ %50 = arith.index_cast %46 : i32 to index
+ %51 = arith.index_cast %49 : i32 to index
+ %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+ vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After GpuKernelOutlining (gpu-kernel-outlining) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = builtin.unrealized_conversion_cast %arg0 : memref<64x32xf16, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %1 = llvm.mlir.constant(0 : i32) : i32
+ %2 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %3 = llvm.extractvalue %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %4 = llvm.insertvalue %1, %3[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %6 = llvm.insertvalue %1, %5[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %7 = llvm.insertvalue %1, %6[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %8 = llvm.insertvalue %1, %7[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %9 = llvm.insertvalue %1, %8[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %1, %9[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %1, %10[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %11, %2[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %13 = builtin.unrealized_conversion_cast %arg1 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<64x16xf16, 3>>
+ %14 = builtin.unrealized_conversion_cast %arg2 : i64 to !nvgpu.warpgroup.descriptor<tensor = memref<16x32xf16, 3>>
+ nvvm.wgmma.fence.aligned
+ %15 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %16 = llvm.extractvalue %12[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %18 = llvm.insertvalue %17, %15[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %19 = llvm.extractvalue %18[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %20 = llvm.mlir.constant(1 : i32) : i32
+ %21 = llvm.mlir.constant(2 : i32) : i32
+ %22 = llvm.mlir.constant(4 : i32) : i32
+ %23 = llvm.mlir.constant(8 : i32) : i32
+ %24 = llvm.mlir.constant(16 : i32) : i32
+ %25 = llvm.mlir.constant(32 : i32) : i32
+ %26 = nvvm.read.ptx.sreg.tid.x : i32
+ %27 = llvm.urem %26, %25 : i32
+ %28 = llvm.udiv %26, %25 : i32
+ %29 = llvm.udiv %27, %22 : i32
+ %30 = llvm.urem %27, %22 : i32
+ %31 = llvm.mul %30, %21 : i32
+ %32 = llvm.mul %28, %24 : i32
+ %33 = llvm.add %29, %32 : i32
+ %34 = llvm.mlir.constant(0 : i32) : i32
+ %35 = llvm.mul %34, %23 : i32
+ %36 = llvm.add %33, %35 : i32
+ %37 = llvm.mlir.constant(0 : i32) : i32
+ %38 = llvm.mul %37, %23 : i32
+ %39 = llvm.add %31, %38 : i32
+ %40 = arith.index_cast %36 : i32 to index
+ %41 = arith.index_cast %39 : i32 to index
+ %42 = llvm.extractvalue %19[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %43 = llvm.bitcast %42 : i32 to vector<2xf16>
+ vector.store %43, %arg0[%40, %41] : memref<64x32xf16, 3>, vector<2xf16>
+ %44 = llvm.mlir.constant(1 : i32) : i32
+ %45 = llvm.mul %44, %23 : i32
+ %46 = llvm.add %33, %45 : i32
+ %47 = llvm.mlir.constant(0 : i32) : i32
+ %48 = llvm.mul %47, %23 : i32
+ %49 = llvm.add %31, %48 : i32
+ %50 = arith.index_cast %46 : i32 to index
+ %51 = arith.index_cast %49 : i32 to index
+ %52 = llvm.extractvalue %19[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %53 = llvm.bitcast %52 : i32 to vector<2xf16>
+ vector.store %53, %arg0[%50, %51] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After ConvertVectorToSCF (convert-vector-to-scf) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.fence.aligned
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %18 = nvvm.read.ptx.sreg.tid.x : i32
+ %19 = llvm.urem %18, %0 : i32
+ %20 = llvm.udiv %18, %0 : i32
+ %21 = llvm.udiv %19, %3 : i32
+ %22 = llvm.urem %19, %3 : i32
+ %23 = llvm.mul %22, %4 : i32
+ %24 = llvm.mul %20, %1 : i32
+ %25 = llvm.add %21, %24 : i32
+ %26 = llvm.mul %6, %2 : i32
+ %27 = llvm.add %25, %26 : i32
+ %28 = llvm.mul %6, %2 : i32
+ %29 = llvm.add %23, %28 : i32
+ %30 = arith.index_cast %27 : i32 to index
+ %31 = arith.index_cast %29 : i32 to index
+ %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+ vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+ %34 = llvm.mul %5, %2 : i32
+ %35 = llvm.add %25, %34 : i32
+ %36 = llvm.mul %6, %2 : i32
+ %37 = llvm.add %23, %36 : i32
+ %38 = arith.index_cast %35 : i32 to index
+ %39 = arith.index_cast %37 : i32 to index
+ %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+ vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.fence.aligned
+ %17 = nvvm.wgmma.mma_async %arg1, %arg2, %16, <m = 64, n = 32, k = 16>, D[<f16>, <one>, <wrapped>], A[<f16>, <one>, <row>], B[<f16>, <one>, <row>] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)> -> !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ nvvm.wgmma.commit.group.sync.aligned
+ nvvm.wgmma.wait.group.sync.aligned 1
+ %18 = nvvm.read.ptx.sreg.tid.x : i32
+ %19 = llvm.urem %18, %0 : i32
+ %20 = llvm.udiv %18, %0 : i32
+ %21 = llvm.udiv %19, %3 : i32
+ %22 = llvm.urem %19, %3 : i32
+ %23 = llvm.mul %22, %4 : i32
+ %24 = llvm.mul %20, %1 : i32
+ %25 = llvm.add %21, %24 : i32
+ %26 = llvm.mul %6, %2 : i32
+ %27 = llvm.add %25, %26 : i32
+ %28 = llvm.mul %6, %2 : i32
+ %29 = llvm.add %23, %28 : i32
+ %30 = arith.index_cast %27 : i32 to index
+ %31 = arith.index_cast %29 : i32 to index
+ %32 = llvm.extractvalue %17[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %33 = llvm.bitcast %32 : i32 to vector<2xf16>
+ vector.store %33, %arg0[%30, %31] : memref<64x32xf16, 3>, vector<2xf16>
+ %34 = llvm.mul %5, %2 : i32
+ %35 = llvm.add %25, %34 : i32
+ %36 = llvm.mul %6, %2 : i32
+ %37 = llvm.add %23, %36 : i32
+ %38 = arith.index_cast %35 : i32 to index
+ %39 = arith.index_cast %37 : i32 to index
+ %40 = llvm.extractvalue %17[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %41 = llvm.bitcast %40 : i32 to vector<2xf16>
+ vector.store %41, %arg0[%38, %39] : memref<64x32xf16, 3>, vector<2xf16>
+ gpu.return
+ }
+ }
+}
+
+
+// -----// IR Dump After ConvertNVVMToLLVMPass (convert-nvvm-to-llvm) //----- //
+module {
+ gpu.module @asd {
+ gpu.func @foo(%arg0: memref<64x32xf16, 3>, %arg1: i64, %arg2: i64) {
+ %0 = llvm.mlir.constant(32 : i32) : i32
+ %1 = llvm.mlir.constant(16 : i32) : i32
+ %2 = llvm.mlir.constant(8 : i32) : i32
+ %3 = llvm.mlir.constant(4 : i32) : i32
+ %4 = llvm.mlir.constant(2 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ memref.assume_alignment %arg0, 32 : memref<64x32xf16, 3>
+ %7 = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %8 = llvm.extractvalue %7[0] : !llvm.struct<(struct<(i32, i32, i32, i32, i32, i32, i32, i32)>)>
+ %9 = llvm.insertvalue %6, %8[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %10 = llvm.insertvalue %6, %9[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %11 = llvm.insertvalue %6, %10[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %12 = llvm.insertvalue %6, %11[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %13 = llvm.insertvalue %6, %12[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %14 = llvm.insertvalue %6, %13[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %15 = llvm.insertvalue %6, %14[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %16 = llvm.insertvalue %6, %15[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ llvm.inline_asm has_side_effects asm_dialect = att "wgmma.fence.sync.aligned;", "" : () -> ()
+ %17 = llvm.mlir.constant(1 : i32) : i32
+ %18 = llvm.mlir.constant(1 : i32) : i32
+ %19 = llvm.mlir.constant(1 : i32) : i32
+ %20 = llvm.mlir.constant(0 : i32) : i32
+ %21 = llvm.mlir.constant(1 : i32) : i32
+ %22 = llvm.extractvalue %16[0] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %23 = llvm.extractvalue %16[1] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %24 = llvm.extractvalue %16[2] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %25 = llvm.extractvalue %16[3] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %26 = llvm.extractvalue %16[4] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %27 = llvm.extractvalue %16[5] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %28 = llvm.extractvalue %16[6] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %29 = llvm.extractvalue %16[7] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+ %30 = llvm.inline_asm has_side_effects asm_dialect = att "{\0A.reg .pred p;\0Asetp.ne.b32 p, $18, 0;\0Awgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 {$0, $1, $2, $3, $4, $5, $6, $7}, $16, $17, p, $19, $20, $21, $22;\0A}\0A", "=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,l,l,n,n,n,n,n" %22, %23, %24, %25, %26, %27, %28, %29, %arg1, %...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/78556
More information about the Mlir-commits
mailing list