[Mlir-commits] [mlir] Add E2E test cases to test fastmath attribute. (PR #165423)

Thu Oct 30 08:35:01 PDT 2025

https://github.com/mshahneo updated https://github.com/llvm/llvm-project/pull/165423

>From 14dbc7519da688f2b8d75eb0c6260bd8988c6d3f Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Tue, 28 Oct 2025 15:56:37 +0000
Subject: [PATCH 1/2] Add E2E test cases to test fastmath attribute.

Ops tested:
- math.exp
- arith.maximumf
---
 .../Dialect/XeGPU/WG/arith_ops.mlir           | 116 ++++++++++++++++++
 .../Dialect/XeGPU/WG/math_exp.mlir            | 109 ++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir

diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
new file mode 100644
index 0000000000000..fc1fa0104b6ec
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
@@ -0,0 +1,116 @@
+// RUN: mlir-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=workgroup" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#map = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [8, 16]>
+module @gemm attributes {gpu.container_module} {
+  func.func @test_fast_math(%input1: memref<256x256xf32>, %input2: memref<256x256xf32>) -> (memref<256x256xf32>, memref<256x256xf32>) attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c512 = arith.constant 512 : index
+    %input1_gpu = gpu.alloc () : memref<256x256xf32>
+    gpu.memcpy %input1_gpu, %input1 : memref<256x256xf32>, memref<256x256xf32>
+    %input2_gpu = gpu.alloc () : memref<256x256xf32>
+    gpu.memcpy %input2_gpu, %input2 : memref<256x256xf32>, memref<256x256xf32>
+    %result_gpu = gpu.alloc () : memref<256x256xf32>
+    %result_gpu_with_fastmath = gpu.alloc () : memref<256x256xf32>
+    // NOTE: Here we can't use [8, 64] wi threads following
+    // the SG thread layout of [8, 4]. Because runtime will linearize
+    // the x dimension first (we need y dimension to be linearized first).
+    // So just use linearized thread layout of [512, 1] wi threads.
+    gpu.launch_func  @math_kernels::@gpu_maximumf blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+    gpu.launch_func  @math_kernels::@gpu_maximumf_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+
+    %result_host = memref.alloc() : memref<256x256xf32>
+    %result_host_with_fastmath = memref.alloc() : memref<256x256xf32>
+    gpu.memcpy %result_host, %result_gpu : memref<256x256xf32>, memref<256x256xf32>
+    gpu.dealloc %input_gpu : memref<256x256xf32>
+    gpu.dealloc %result_gpu : memref<256x256xf32>
+    return %result_host, %result_host_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
+  }
+
+  gpu.module @math_kernels   {
+    gpu.func @gpu_maximumf(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
+      %c256 = arith.constant 256 : index
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %m = arith.muli %block_id_x, %c256 : index
+      %n = arith.muli %block_id_y, %c256 : index
+      %input_tdesc_1 = xegpu.create_nd_tdesc %input1_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val_1 = xegpu.load_nd %input_tdesc_1[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %input_tdesc_2 = xegpu.create_nd_tdesc %input2_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val_2 = xegpu.load_nd %input_tdesc_2[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %result_val = arith.maximumf %input_val_1, %input_val_2 : vector<256x256xf32>
+      %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
+      gpu.return
+    }
+
+    // Kernel with fastmath attribute
+    gpu.func @gpu_maximumf_with_fastmath(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
+      %c256 = arith.constant 256 : index
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %m = arith.muli %block_id_x, %c256 : index
+      %n = arith.muli %block_id_y, %c256 : index
+      %input_tdesc_1 = xegpu.create_nd_tdesc %input1_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val_1 = xegpu.load_nd %input_tdesc_1[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %input_tdesc_2 = xegpu.create_nd_tdesc %input2_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val_2 = xegpu.load_nd %input_tdesc_2[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %result_val = arith.maximumf %input_val_1, %input_val_2 fastmath<fast> : vector<256x256xf32>
+      %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
+      gpu.return
+    }
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2_f32 = arith.constant 2.2 : f32
+    %c256 = arith.constant 256 : index
+    %input_1 = memref.alloc() : memref<256x256xf32>
+    %input_2 = memref.alloc() : memref<256x256xf32>
+    %cpu_ref_result = memref.alloc() : memref<256x256xf32>
+
+    scf.for %arg0 = %c0 to %c256 step %c1 {
+      scf.for %arg1 = %c0 to %c256 step %c1 {
+        memref.store %c2_f32, %input_1[%arg0, %arg1] : memref<256x256xf32>
+        memref.store %c2_f32, %input_2[%arg0, %arg1] : memref<256x256xf32>
+      }
+    }
+
+    // Run CPU version
+    scf.for %arg0 = %c0 to %c256 step %c1 {
+      scf.for %arg1 = %c0 to %c256 step %c1 {
+        %val_1 = memref.load %input_1[%arg0, %arg1] : memref<256x256xf32>
+        %val_2 = memref.load %input_2[%arg0, %arg1] : memref<256x256xf32>
+        %res_val = arith.maximumf %val_1, %val_2 : f32
+        memref.store %res_val, %cpu_ref_result[%arg0, %arg1] : memref<256x256xf32>
+      }
+    }
+
+    // Run GPU version.
+    %gpu_result, %gpu_result_fastmath = call @test_fast_math(%input_1, %input_2) : (memref<256x256xf32>, memref<256x256xf32>) -> (memref<256x256xf32>, memref<256x256xf32>)
+    %gpu_result_cast = memref.cast %gpu_result : memref<256x256xf32> to memref<*xf32>
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-COUNT-256: [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,   89,   90,   91,   92,   93,   94,   95,   96,   97,   98,   99,   100,   101,   102,   103,   104,   105,   106,   107,   108,   109,   110,   111,   112,   113,   114,   115,   116,   117,   118,   119,   120,   121,   122,   123,   124,   125,   126,   127,   128,   129,   130,   131,   132,   133,   134,   135,   136,   137,   138,   139,   140,   141,   142,   143,   144,   145,   146,   147,   148,   149,   150,   151,   152,   153,   154,   155,   156,   157,   158,   159,   160,   161,   162,   163,   164,   165,   166,   167,   168,   169,   170,   171,   172,   173,   174,   175,   176,   177,   178,   179,   180,   181,   182,   183,   184,   185,   186,   187,   188,   189,   190,   191,   192,   193,   194,   195,   196,   197,   198,   199,   200,   201,   202,   203,   204,   205,   206,   207,   208,   209,   210,   211,   212,   213,   214,   215,   216,   217,   218,   219,   220,   221,   222,   223,   224,   225,   226,   227,   228,   229,   230,   231,   232,   233,   234,   235,   236,   237,   238,   239,   240,   241,   242,   243,   244,   245,   246,   247,   248,   249,   250,   251,   252,   253,   254,   255]
+    call @printMemrefF32(%gpu_result_cast) : (memref<*xf32>) -> ()
+
+    memref.dealloc %input_1 : memref<256x256xf32>
+    memref.dealloc %input_2 : memref<256x256xf32>
+    memref.dealloc %cpu_ref_result : memref<256x256xf32>
+    memref.dealloc %gpu_result : memref<256x256xf32>
+    memref.dealloc %gpu_result_fastmath : memref<256x256xf32>
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir
new file mode 100644
index 0000000000000..abbd4bd1422f6
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir
@@ -0,0 +1,109 @@
+// RUN: mlir-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=workgroup" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+#map = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [8, 16]>
+module @gemm attributes {gpu.container_module} {
+  func.func @test_fast_math(%input: memref<256x256xf32>) -> (memref<256x256xf32>, memref<256x256xf32>) attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %c128 = arith.constant 128 : index
+    %c512 = arith.constant 512 : index
+    %input_gpu = gpu.alloc () : memref<256x256xf32>
+    gpu.memcpy %input_gpu, %input : memref<256x256xf32>, memref<256x256xf32>
+    %result_gpu = gpu.alloc () : memref<256x256xf32>
+    %result_gpu_with_fastmath = gpu.alloc () : memref<256x256xf32>
+    // NOTE: Here we can't use [8, 64] wi threads following
+    // the SG thread layout of [8, 4]. Because runtime will linearize
+    // the x dimension first (we need y dimension to be linearized first).
+    // So just use linearized thread layout of [512, 1] wi threads.
+    gpu.launch_func  @math_kernels::@gpu_exp blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+    gpu.launch_func  @math_kernels::@gpu_exp_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu_with_fastmath : memref<256x256xf32>)
+
+    %result_host = memref.alloc() : memref<256x256xf32>
+    %result_host_with_fastmath = memref.alloc() : memref<256x256xf32>
+    gpu.memcpy %result_host, %result_gpu : memref<256x256xf32>, memref<256x256xf32>
+    gpu.dealloc %input_gpu : memref<256x256xf32>
+    gpu.dealloc %result_gpu : memref<256x256xf32>
+    return %result_host, %result_host_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
+  }
+
+  gpu.module @math_kernels   {
+    gpu.func @gpu_exp(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
+      %c256 = arith.constant 256 : index
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %m = arith.muli %block_id_x, %c256 : index
+      %n = arith.muli %block_id_y, %c256 : index
+      %input_tdesc = xegpu.create_nd_tdesc %input_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val = xegpu.load_nd %input_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %result_val = math.exp %input_val : vector<256x256xf32>
+      %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
+      gpu.return
+    }
+
+    // Kernel with fastmath attribute
+    gpu.func @gpu_exp_with_fastmath(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
+      %c256 = arith.constant 256 : index
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %m = arith.muli %block_id_x, %c256 : index
+      %n = arith.muli %block_id_y, %c256 : index
+      %input_tdesc = xegpu.create_nd_tdesc %input_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_val = xegpu.load_nd %input_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
+      %result_val = math.exp %input_val fastmath<fast> : vector<256x256xf32>
+      %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
+      gpu.return
+    }
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2_f32 = arith.constant 2.2 : f32
+    %c256 = arith.constant 256 : index
+    %input = memref.alloc() : memref<256x256xf32>
+    %input_ref = memref.alloc() : memref<256x256xf32>
+    %cpu_ref_result = memref.alloc() : memref<256x256xf32>
+
+    scf.for %arg0 = %c0 to %c256 step %c1 {
+      scf.for %arg1 = %c0 to %c256 step %c1 {
+        memref.store %c2_f32, %input[%arg0, %arg1] : memref<256x256xf32>
+        memref.store %c2_f32, %input_ref[%arg0, %arg1] : memref<256x256xf32>
+      }
+    }
+
+    // Run CPU version
+    scf.for %arg0 = %c0 to %c256 step %c1 {
+      scf.for %arg1 = %c0 to %c256 step %c1 {
+        %val = memref.load %input_ref[%arg0, %arg1] : memref<256x256xf32>
+        %res_val = math.exp %val : f32
+        memref.store %res_val, %cpu_ref_result[%arg0, %arg1] : memref<256x256xf32>
+      }
+    }
+
+    // Run GPU version.
+    %gpu_result, %gpu_result_fastmath = call @test_fast_math(%input) : (memref<256x256xf32>) -> (memref<256x256xf32>, memref<256x256xf32>)
+    %gpu_result_cast = memref.cast %gpu_result : memref<256x256xf32> to memref<*xf32>
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-COUNT-256: [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,   89,   90,   91,   92,   93,   94,   95,   96,   97,   98,   99,   100,   101,   102,   103,   104,   105,   106,   107,   108,   109,   110,   111,   112,   113,   114,   115,   116,   117,   118,   119,   120,   121,   122,   123,   124,   125,   126,   127,   128,   129,   130,   131,   132,   133,   134,   135,   136,   137,   138,   139,   140,   141,   142,   143,   144,   145,   146,   147,   148,   149,   150,   151,   152,   153,   154,   155,   156,   157,   158,   159,   160,   161,   162,   163,   164,   165,   166,   167,   168,   169,   170,   171,   172,   173,   174,   175,   176,   177,   178,   179,   180,   181,   182,   183,   184,   185,   186,   187,   188,   189,   190,   191,   192,   193,   194,   195,   196,   197,   198,   199,   200,   201,   202,   203,   204,   205,   206,   207,   208,   209,   210,   211,   212,   213,   214,   215,   216,   217,   218,   219,   220,   221,   222,   223,   224,   225,   226,   227,   228,   229,   230,   231,   232,   233,   234,   235,   236,   237,   238,   239,   240,   241,   242,   243,   244,   245,   246,   247,   248,   249,   250,   251,   252,   253,   254,   255]
+    call @printMemrefF32(%gpu_result_cast) : (memref<*xf32>) -> ()
+
+    memref.dealloc %input : memref<256x256xf32>
+    memref.dealloc %input_ref : memref<256x256xf32>
+    memref.dealloc %cpu_ref_result : memref<256x256xf32>
+    memref.dealloc %gpu_result : memref<256x256xf32>
+    memref.dealloc %gpu_result_fastmath : memref<256x256xf32>
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+}

>From 36f193212846750f7d3a9b1935fc05c10987cc62 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Thu, 30 Oct 2025 15:34:13 +0000
Subject: [PATCH 2/2] Fix an issue.

Currently, only one kernel per module is supported, so each kernel is moved into its own gpu.module.
---
 .../Dialect/XeGPU/WG/arith_ops.mlir           | 16 +++++++++-----
 .../Dialect/XeGPU/WG/math_exp.mlir            | 22 ++++++++++++-------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
index fc1fa0104b6ec..c3d9c69d26b4b 100644
--- a/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/arith_ops.mlir
@@ -26,18 +26,20 @@ module @gemm attributes {gpu.container_module} {
     // the SG thread layout of [8, 4]. Because runtime will linearize
     // the x dimension first (we need y dimension to be linearized first).
     // So just use linearized thread layout of [512, 1] wi threads.
-    gpu.launch_func  @math_kernels::@gpu_maximumf blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
-    gpu.launch_func  @math_kernels::@gpu_maximumf_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+    gpu.launch_func  @arith_maximumf_module::@gpu_maximumf blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+    gpu.launch_func  @arith_maximumf_fastmath_module::@gpu_maximumf_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu_with_fastmath : memref<256x256xf32>)
 
     %result_host = memref.alloc() : memref<256x256xf32>
     %result_host_with_fastmath = memref.alloc() : memref<256x256xf32>
     gpu.memcpy %result_host, %result_gpu : memref<256x256xf32>, memref<256x256xf32>
-    gpu.dealloc %input_gpu : memref<256x256xf32>
+    gpu.memcpy %result_host_with_fastmath, %result_gpu_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
+    gpu.dealloc %input1_gpu : memref<256x256xf32>
+    gpu.dealloc %input2_gpu : memref<256x256xf32>
     gpu.dealloc %result_gpu : memref<256x256xf32>
     return %result_host, %result_host_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
   }
 
-  gpu.module @math_kernels   {
+  gpu.module @arith_maximumf_module {
     gpu.func @gpu_maximumf(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
       %c256 = arith.constant 256 : index
       %block_id_x = gpu.block_id x
@@ -48,12 +50,14 @@ module @gemm attributes {gpu.container_module} {
       %input_val_1 = xegpu.load_nd %input_tdesc_1[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
       %input_tdesc_2 = xegpu.create_nd_tdesc %input2_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       %input_val_2 = xegpu.load_nd %input_tdesc_2[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
-      %result_val = arith.maximumf %input_val_1, %input_val_2 : vector<256x256xf32>
+      %result_val = arith.maximumf %input_val_1, %input_val_2 {layout_result_0 = #map} : vector<256x256xf32>
       %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
       gpu.return
     }
+  }
 
+  gpu.module @arith_maximumf_fastmath_module {
     // Kernel with fastmath attribute
     gpu.func @gpu_maximumf_with_fastmath(%input1_gpu : memref<256x256xf32>, %input2_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
       %c256 = arith.constant 256 : index
@@ -65,7 +69,7 @@ module @gemm attributes {gpu.container_module} {
       %input_val_1 = xegpu.load_nd %input_tdesc_1[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
       %input_tdesc_2 = xegpu.create_nd_tdesc %input2_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       %input_val_2 = xegpu.load_nd %input_tdesc_2[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
-      %result_val = arith.maximumf %input_val_1, %input_val_2 fastmath<fast> : vector<256x256xf32>
+      %result_val = arith.maximumf %input_val_1, %input_val_2 fastmath<fast> {layout_result_0 = #map} : vector<256x256xf32>
       %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
       gpu.return
diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir
index abbd4bd1422f6..91a058c8ffe8f 100644
--- a/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/math_exp.mlir
@@ -24,18 +24,20 @@ module @gemm attributes {gpu.container_module} {
     // the SG thread layout of [8, 4]. Because runtime will linearize
     // the x dimension first (we need y dimension to be linearized first).
     // So just use linearized thread layout of [512, 1] wi threads.
-    gpu.launch_func  @math_kernels::@gpu_exp blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
-    gpu.launch_func  @math_kernels::@gpu_exp_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu_with_fastmath : memref<256x256xf32>)
+    gpu.launch_func  @math_exp_module::@gpu_exp blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>)
+    gpu.launch_func  @math_exp_fastmath_module::@gpu_exp_with_fastmath blocks in (%c1, %c1, %c1) threads in (%c512, %c1, %c1) args(%input_gpu : memref<256x256xf32>, %result_gpu_with_fastmath : memref<256x256xf32>)
 
     %result_host = memref.alloc() : memref<256x256xf32>
     %result_host_with_fastmath = memref.alloc() : memref<256x256xf32>
     gpu.memcpy %result_host, %result_gpu : memref<256x256xf32>, memref<256x256xf32>
+    gpu.memcpy %result_host_with_fastmath, %result_gpu_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
     gpu.dealloc %input_gpu : memref<256x256xf32>
     gpu.dealloc %result_gpu : memref<256x256xf32>
+    gpu.dealloc %result_gpu_with_fastmath : memref<256x256xf32>
     return %result_host, %result_host_with_fastmath : memref<256x256xf32>, memref<256x256xf32>
   }
 
-  gpu.module @math_kernels   {
+  gpu.module @math_exp_module   {
     gpu.func @gpu_exp(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
       %c256 = arith.constant 256 : index
       %block_id_x = gpu.block_id x
@@ -44,23 +46,27 @@ module @gemm attributes {gpu.container_module} {
       %n = arith.muli %block_id_y, %c256 : index
       %input_tdesc = xegpu.create_nd_tdesc %input_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       %input_val = xegpu.load_nd %input_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
-      %result_val = math.exp %input_val : vector<256x256xf32>
+      %result_val = math.exp %input_val {layout_result_0 = #map} : vector<256x256xf32>
       %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
       gpu.return
     }
 
+
+  }
+
+  gpu.module @math_exp_fastmath_module {
     // Kernel with fastmath attribute
-    gpu.func @gpu_exp_with_fastmath(%input_gpu : memref<256x256xf32>, %result_gpu : memref<256x256xf32>) kernel  {
+    gpu.func @gpu_exp_with_fastmath(%input_gpu_with_fast_math : memref<256x256xf32>, %result_gpu_with_fastmath : memref<256x256xf32>) kernel  {
       %c256 = arith.constant 256 : index
       %block_id_x = gpu.block_id x
       %block_id_y = gpu.block_id y
       %m = arith.muli %block_id_x, %c256 : index
       %n = arith.muli %block_id_y, %c256 : index
-      %input_tdesc = xegpu.create_nd_tdesc %input_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %input_tdesc = xegpu.create_nd_tdesc %input_gpu_with_fast_math : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       %input_val = xegpu.load_nd %input_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #map> -> vector<256x256xf32>
-      %result_val = math.exp %input_val fastmath<fast> : vector<256x256xf32>
-      %result_tdesc = xegpu.create_nd_tdesc %result_gpu : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
+      %result_val = math.exp %input_val fastmath<fast> {layout_result_0 = #map} : vector<256x256xf32>
+      %result_tdesc = xegpu.create_nd_tdesc %result_gpu_with_fastmath : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #map>
       xegpu.store_nd %result_val, %result_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #map>
       gpu.return
     }