[Mlir-commits] [mlir] 97a6aa8 - [MLIR][XeGPU][GPU] Optimize GPU to XeVM pipeline (#184711)

Tue Mar 10 10:52:20 PDT 2026

Author: Sang Ik Lee
Date: 2026-03-10T10:52:15-07:00
New Revision: 97a6aa83cab5dd4a34a3f1ecaa34b909cf7c5cce

URL: https://github.com/llvm/llvm-project/commit/97a6aa83cab5dd4a34a3f1ecaa34b909cf7c5cce
DIFF: https://github.com/llvm/llvm-project/commit/97a6aa83cab5dd4a34a3f1ecaa34b909cf7c5cce.diff

LOG: [MLIR][XeGPU][GPU] Optimize GPU to XeVM pipeline (#184711)

Some XeGPU transforms can generate code sequences that can simplified by
folding. But full canonicalization is not required. As an alternative,
remove canonicalize from some parts of the pipeline where only folding
is needed and add folding at the end of XeGPU blocking pass
and XeGPU peephole optimize pass.

Added: 
    

Modified: 
    mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index ba2601038acb0..2c0346f4b2d56 100644

--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -78,7 +78,6 @@ void buildGPUPassPipeline(OpPassManager &pm,
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(instDataOptions));
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUBlocking());
-    pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
   }
   if (options.xegpuOpLevel == "subgroup" ||
@@ -86,7 +85,6 @@ void buildGPUPassPipeline(OpPassManager &pm,
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(laneLayoutOptions));
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPeepHoleOptimizer());
-    pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(laneLayoutOptions));

diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index b815950361b04..1ee0bc6ad9507 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -381,6 +381,11 @@ void XeGPUBlockingPass::runOnOperation() {
   populateXeGPUUnrollPatterns(patterns, options);
   vector::populateVectorUnrollPatterns(patterns, vectorOptions);
 
+  // Note: The pattern driver does op folding as well and clean up.
+  // But intermediate insert/extract strided slice ops with
+  // unrealized conversion cast ops in the middle does not get
+  // cleaned up in this step. One more round of folding is needed
+  // after the walk to resolve those unrealized conversion cast ops.
   (void)applyPatternsGreedily(op, std::move(patterns));
 
   op->walk([](Operation *op) {
@@ -405,4 +410,9 @@ void XeGPUBlockingPass::runOnOperation() {
     if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
       resolveUnrealizedConversionCastOp(castOp);
   });
+
+  // One more round of folding to clean up the intermediate
+  // insert/extract strided slice ops.
+  RewritePatternSet emptyPatterns(ctx);
+  (void)applyPatternsGreedily(op, std::move(emptyPatterns));
 }

diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 3b3b11cebe213..d7a9b7ba377f9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -25,6 +25,7 @@
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include <optional>
@@ -583,6 +584,11 @@ struct XeGPUPeepHoleOptimizerPass final
       DBGS() << "Optimize block loads pass failed.\n";
       return signalPassFailure();
     }
+
+    // Apply folding for cleaning up IR.
+    MLIRContext *ctx = &getContext();
+    RewritePatternSet emptyPatterns(ctx);
+    (void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));
   }
 };