[Mlir-commits] [mlir] [MLIR][XeVM] Remove xevm to llvm from convert to llvm (PR #175672)

Tue Jan 27 15:49:17 PST 2026

https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/175672

>From a465a8891e9905bb74666d9f433ed826aa82b505 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 9 Jan 2026 11:44:44 -0800
Subject: [PATCH 01/10] [MLIR][XeVM] Unregister convert xevm to llvm patterns
 from convert to llvm interface. convert xevm to llvm is target specific pass
 tied to SPIRV OpenCL kernels. As such, conversion patterns should be part of
 the generic convert to llvm pass.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 28 -------------------
 .../GPU/Pipelines/GPUToXeVMPipeline.cpp       |  2 ++
 mlir/lib/RegisterAllExtensions.cpp            |  2 --
 3 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 20a420dfda65c..79e256346574b 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -8,7 +8,6 @@
 
 #include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h"
 
-#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -881,28 +880,6 @@ struct ConvertXeVMToLLVMPass
 };
 } // namespace
 
-//===----------------------------------------------------------------------===//
-// ConvertToLLVMPatternInterface implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// Implement the interface to convert XeVM to LLVM.
-struct XeVMToLLVMDialectInterface : public ConvertToLLVMPatternInterface {
-  using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface;
-  void loadDependentDialects(MLIRContext *context) const final {
-    context->loadDialect<LLVM::LLVMDialect>();
-  }
-
-  /// Hook for derived dialect interface to provide conversion patterns
-  /// and mark dialect legal for the conversion target.
-  void populateConvertToLLVMConversionPatterns(
-      ConversionTarget &target, LLVMTypeConverter &typeConverter,
-      RewritePatternSet &patterns) const final {
-    populateXeVMToLLVMConversionPatterns(target, patterns);
-  }
-};
-} // namespace
-
 //===----------------------------------------------------------------------===//
 // Pattern Population
 //===----------------------------------------------------------------------===//
@@ -938,8 +915,3 @@ void ::mlir::populateXeVMToLLVMConversionPatterns(ConversionTarget &target,
       patterns.getContext());
 }
 
-void ::mlir::registerConvertXeVMToLLVMInterface(DialectRegistry &registry) {
-  registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) {
-    dialect->addInterfaces<XeVMToLLVMDialectInterface>();
-  });
-}
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 38313dc3c01d5..3cee74a06e81f 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/MathToXeVM/MathToXeVM.h"
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
@@ -113,6 +114,7 @@ void buildPostGPUCommonPassPipeline(
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertVectorToLLVMPass());
   pm.addPass(createConvertToLLVMPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp
index 4312100a0c0b0..7850d303c1283 100644
--- a/mlir/lib/RegisterAllExtensions.cpp
+++ b/mlir/lib/RegisterAllExtensions.cpp
@@ -32,7 +32,6 @@
 #include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
-#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h"
 #include "mlir/Dialect/AMX/Transforms.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
 #include "mlir/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.h"
@@ -93,7 +92,6 @@ void mlir::registerAllExtensions(DialectRegistry &registry) {
   gpu::registerConvertGpuToLLVMInterface(registry);
   NVVM::registerConvertGpuToNVVMInterface(registry);
   vector::registerConvertVectorToLLVMInterface(registry);
-  registerConvertXeVMToLLVMInterface(registry);
 
   // Register all transform dialect extensions.
   affine::registerTransformDialectExtension(registry);

>From 831db027ddb314af4c86956bbc55df6d2785d3b2 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Mon, 12 Jan 2026 10:58:11 -0800
Subject: [PATCH 02/10] Add pattern to hoist extract contiguous slice like
 shufflevector.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 103 ++++++++++++++++++
 .../Conversion/XeVMToLLVM/xevm-to-llvm.mlir   |  14 +--
 2 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 79e256346574b..a4e6ad1c2c5b8 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -18,6 +18,7 @@
 
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -857,6 +858,99 @@ class SubgroupOpWorkitemOpToOCLPattern : public OpConversionPattern<OpType> {
   }
 };
 
+static bool isExtractingContiguousSlice(LLVM::ShuffleVectorOp op) {
+  if (op.getV1() != op.getV2())
+    return false;
+  auto maskAttr = op.getMask();
+  int64_t firstIndex = maskAttr[0];
+  for (int64_t i = 1; i < static_cast<int64_t>(maskAttr.size()); ++i) {
+    int64_t index = maskAttr[i];
+    if (index != firstIndex + i)
+      return false;
+  }
+  return true;
+}
+
+// Input vector of a shuffle vector op extracting a contiguous slice is an
+// illegal vector in SPIRV kernel if the vector size is > 16 elements.
+// To legalize this case, keep applying the following transformations until no
+// more match:
+//   1. keep hoisting the shuffle vector op past unary element-wise operations
+//       start with fpext, fptrunc and bitcast for now.
+//   2. merge with another shuffle vector op
+//   3. merge with load as a smaller load
+class HandleVectorExtractPattern
+    : public OpRewritePattern<LLVM::ShuffleVectorOp> {
+  using OpRewritePattern<LLVM::ShuffleVectorOp>::OpRewritePattern;
+
+  void initialize() { setHasBoundedRewriteRecursion(); }
+
+  LogicalResult matchAndRewrite(LLVM::ShuffleVectorOp op,
+                                PatternRewriter &rewriter) const override {
+
+    if (!isExtractingContiguousSlice(op))
+      return failure();
+
+    auto mask = op.getMask();
+    auto loc = op.getLoc();
+    auto ty = op.getType();
+    // Check source operand to determine rewrite pattern.
+    auto src = op.getV1();
+    // 1. Hoist past unary element-wise operations
+    if (auto srcOp = src.getDefiningOp()) {
+      if (isa<LLVM::FPExtOp>(srcOp) || isa<LLVM::FPTruncOp>(srcOp) ||
+          isa<LLVM::BitcastOp>(srcOp)) {
+        Value srcInput = srcOp->getOperand(0);
+        // Create new shuffle vector op with unary input as source.
+        auto srcVecTy = dyn_cast<VectorType>(srcInput.getType());
+        auto newShuffleVecTy =
+            VectorType::get(mask.size(), srcVecTy.getElementType());
+        auto newShuffle = LLVM::ShuffleVectorOp::create(
+            rewriter, loc, newShuffleVecTy, srcInput, srcInput, mask);
+        // Create new unary op with new shuffle as input.
+        Value newUnaryOp;
+        if (isa<LLVM::FPExtOp>(srcOp)) {
+          newUnaryOp = LLVM::FPExtOp::create(rewriter, loc, ty, newShuffle);
+        } else if (isa<LLVM::FPTruncOp>(srcOp)) {
+          newUnaryOp = LLVM::FPTruncOp::create(rewriter, loc, ty, newShuffle);
+        } else if (isa<LLVM::BitcastOp>(srcOp)) {
+          newUnaryOp = LLVM::BitcastOp::create(rewriter, loc, ty, newShuffle);
+        }
+        rewriter.replaceOp(op, newUnaryOp);
+      } else if (isa<LLVM::ShuffleVectorOp>(srcOp)) {
+        // 2. Merge with another shuffle vector op
+        auto srcShuffle = cast<LLVM::ShuffleVectorOp>(srcOp);
+        auto srcMask = srcShuffle.getMask();
+        SmallVector<int32_t> combinedMask;
+        for (auto index : mask) {
+          combinedMask.push_back(srcMask[index]);
+        }
+        auto newShuffle = LLVM::ShuffleVectorOp::create(
+            rewriter, loc, ty, srcShuffle.getV1(), srcShuffle.getV1(),
+            DenseI32ArrayAttr::get(rewriter.getContext(), combinedMask));
+        rewriter.replaceOp(op, newShuffle);
+      } else if (auto loadOp = src.getDefiningOp<LLVM::LoadOp>()) {
+        // 3. Merge with load as a smaller load
+        auto loadPtr = loadOp.getAddr();
+        auto loadTy = dyn_cast<VectorType>(loadOp.getType());
+        auto elemTy = loadTy.getElementType();
+        auto firstIndex = mask[0];
+        auto newVecTy = VectorType::get(mask.size(), elemTy);
+        auto newPtr = LLVM::GEPOp::create(
+            rewriter, loc,
+            LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                       loadPtr.getType().getAddressSpace()),
+            elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
+        auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
+        rewriter.replaceOp(op, newLoad);
+      } else {
+        return failure();
+      }
+    }
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//
@@ -876,6 +970,15 @@ struct ConvertXeVMToLLVMPass
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
       signalPassFailure();
+
+    // Apply in-dialect lowerings to handle illegal vectors
+    {
+      RewritePatternSet vectorPatterns(&getContext());
+      vectorPatterns.add<HandleVectorExtractPattern>(&getContext());
+      if (failed(
+              applyPatternsGreedily(getOperation(), std::move(vectorPatterns))))
+        signalPassFailure();
+    }
   }
 };
 } // namespace
diff --git a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
index 7f01526cb0a06..06a0ff5e7484b 100644
--- a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
+++ b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
@@ -1,21 +1,17 @@
 // RUN: mlir-opt --convert-xevm-to-llvm --split-input-file %s | FileCheck %s
 
-// Same below, but using the `ConvertToLLVMPatternInterface` entry point
-// and the generic `convert-to-llvm` pass.
-// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
-
 // CHECK-LABEL:      llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(
 // CHECK-SAME:   !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>,
 // CHECK-SAME:   !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
 // CHECK:      llvm.func @blockload2d(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi16> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -51,12 +47,12 @@ llvm.func @blockload2d_cache_control(%a: !llvm.ptr<1>, %base_width_a: i32, %base
 // CHECK:      llvm.func @blockload2d_v_blocks(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_v_blocks(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<16xi16> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -80,12 +76,12 @@ llvm.func @blockload2d_v_blocks(%a: !llvm.ptr<1>, %base_width_a: i32, %base_heig
 // CHECK:      llvm.func @blockload2d_pack_register(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_pack_register(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_16r16x1cPU3AS1viiiDv2_iPj(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -109,12 +105,12 @@ llvm.func @blockload2d_pack_register(%a: !llvm.ptr<1>, %base_width_a: i32, %base
 // CHECK:      llvm.func @blockload2d_transpose(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_transpose(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -138,12 +134,12 @@ llvm.func @blockload2d_transpose(%a: !llvm.ptr<1>, %base_width_a: i32, %base_hei
 // CHECK: llvm.func @blockstore2d(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32, %[[ARG6:.*]]: vector<8xi32>) {
 llvm.func @blockstore2d(%c: !llvm.ptr<1>, %base_width_c: i32, %base_height_c: i32, %base_pitch_c: i32, %x: i32, %y: i32, %c_result_casted: vector<8xi32>) {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.store %[[ARG6]], %[[VAR6]] : vector<8xi32>, !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(

>From dd38883efd06fd47a3210f854590bfd5a74e8869 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 13 Jan 2026 09:37:02 -0800
Subject: [PATCH 03/10] Fix format issue.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index a4e6ad1c2c5b8..2710e8aa0e595 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -1017,4 +1017,3 @@ void ::mlir::populateXeVMToLLVMConversionPatterns(ConversionTarget &target,
                SubgroupOpWorkitemOpToOCLPattern<SubgroupSizeOp>>(
       patterns.getContext());
 }
-

>From 803c9ee29b782c438bd2947f0df08914312a3af7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 14 Jan 2026 15:05:30 -0800
Subject: [PATCH 04/10] GPU XeVM pipline: Invoke xevm to llvm just before gpu
 module to binary.

---
 mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 3cee74a06e81f..43886f2013a28 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -114,10 +114,10 @@ void buildPostGPUCommonPassPipeline(
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertVectorToLLVMPass());
   pm.addPass(createConvertToLLVMPass());
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   // gpu-module-to-binary
   {
     GpuModuleToBinaryPassOptions gpuToModuleBinOptions;

>From e81b7bcc98140c2af82b19d47e9aa00a2a437c1c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 15 Jan 2026 08:57:46 -0800
Subject: [PATCH 05/10] Disable folding.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 2710e8aa0e595..280139359f715 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -936,13 +936,19 @@ class HandleVectorExtractPattern
         auto elemTy = loadTy.getElementType();
         auto firstIndex = mask[0];
         auto newVecTy = VectorType::get(mask.size(), elemTy);
-        auto newPtr = LLVM::GEPOp::create(
-            rewriter, loc,
-            LLVM::LLVMPointerType::get(rewriter.getContext(),
-                                       loadPtr.getType().getAddressSpace()),
-            elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
-        auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
-        rewriter.replaceOp(op, newLoad);
+        // GEPOp is needed if first index is not zero
+        if (firstIndex) {
+          auto newPtr = LLVM::GEPOp::create(
+              rewriter, loc,
+              LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                         loadPtr.getType().getAddressSpace()),
+              elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
+          auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
+          rewriter.replaceOp(op, newLoad);
+        } else {
+          auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, loadPtr);
+          rewriter.replaceOp(op, newLoad);
+        }
       } else {
         return failure();
       }
@@ -975,8 +981,10 @@ struct ConvertXeVMToLLVMPass
     {
       RewritePatternSet vectorPatterns(&getContext());
       vectorPatterns.add<HandleVectorExtractPattern>(&getContext());
-      if (failed(
-              applyPatternsGreedily(getOperation(), std::move(vectorPatterns))))
+      GreedyRewriteConfig config{};
+      config.enableFolding(false);
+      if (failed(applyPatternsGreedily(getOperation(),
+                                       std::move(vectorPatterns), config)))
         signalPassFailure();
     }
   }

>From fc923b6388ef54f8444a921a75447a36046d2140 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 23 Jan 2026 09:06:57 -0800
Subject: [PATCH 06/10] Add test case.

---
 .../XeVMToLLVM/legalize_large_vector.mlir     | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir

diff --git a/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir b/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir
new file mode 100644
index 0000000000000..150f6dba5cbaa
--- /dev/null
+++ b/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir
@@ -0,0 +1,51 @@
+// RUN: mlir-opt --convert-xevm-to-llvm --split-input-file %s | FileCheck %s
+
+module @test_illegal_vector {
+  // CHECK-LABEL: llvm.func @test_illegal_vector
+  // CHECK: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: !llvm.ptr, %[[ARG4:.*]]: !llvm.ptr
+  llvm.func @test_illegal_vector(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr, %arg4: !llvm.ptr) {
+    // CHECK: %[[LOAD0:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST0:.*]] = llvm.bitcast %[[LOAD0]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT0:.*]] = llvm.fpext %[[BITCAST0]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ARG0]][8] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD1:.*]] = llvm.load %[[GEP0]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST1:.*]] = llvm.bitcast %[[LOAD1]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT1:.*]] = llvm.fpext %[[BITCAST1]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[BITCAST2:.*]] = llvm.bitcast %[[FPEXT0]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST2]], %[[ARG1]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[BITCAST3:.*]] = llvm.bitcast %[[FPEXT1]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST3]], %[[ARG2]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[ARG0]][16] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD2:.*]] = llvm.load %[[GEP1]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST4:.*]] = llvm.bitcast %[[LOAD2]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT2:.*]] = llvm.fpext %[[BITCAST4]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[GEP2:.*]] = llvm.getelementptr %[[ARG0]][24] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD3:.*]] = llvm.load %[[GEP2]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST5:.*]] = llvm.bitcast %[[LOAD3]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT3:.*]] = llvm.fpext %[[BITCAST5]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[BITCAST6:.*]] = llvm.bitcast %[[FPEXT2]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST6]], %[[ARG3]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[BITCAST7:.*]] = llvm.bitcast %[[FPEXT3]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST7]], %[[ARG4]] : vector<8xi32>, !llvm.ptr
+    // CHECK: llvm.return
+      %0 = llvm.load %arg0 : !llvm.ptr -> vector<32xi16>
+      %1 = llvm.bitcast %0 : vector<32xi16> to vector<32xf16>
+      %2 = llvm.shufflevector %1, %1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf16>
+      %3 = llvm.shufflevector %1, %1 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf16>
+      %4 = llvm.fpext %2 : vector<16xf16> to vector<16xf32>
+      %5 = llvm.fpext %3 : vector<16xf16> to vector<16xf32>
+      %6 = llvm.shufflevector %4, %4 [0, 1, 2, 3, 4, 5, 6, 7] : vector<16xf32>
+      %7 = llvm.shufflevector %4, %4 [8, 9, 10, 11, 12, 13, 14, 15] : vector<16xf32>
+      %8 = llvm.bitcast %6 : vector<8xf32> to vector<8xi32>
+      llvm.store %8, %arg1 : vector<8xi32>, !llvm.ptr
+      %9 = llvm.bitcast %7 : vector<8xf32> to vector<8xi32>
+      llvm.store %9, %arg2 : vector<8xi32>, !llvm.ptr
+      %10 = llvm.shufflevector %5, %5 [0, 1, 2, 3, 4, 5, 6, 7] : vector<16xf32>
+      %11 = llvm.shufflevector %5, %5 [8, 9, 10, 11, 12, 13, 14, 15] : vector<16xf32>
+      %12 = llvm.bitcast %10 : vector<8xf32> to vector<8xi32>
+      llvm.store %12, %arg3 : vector<8xi32>, !llvm.ptr
+      %13 = llvm.bitcast %11 : vector<8xf32> to vector<8xi32>
+      llvm.store %13, %arg4 : vector<8xi32>, !llvm.ptr
+      llvm.return
+  }
+}

>From 8681cab5e8124a07f8871eb91af71a4d7bcc3d99 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 23 Jan 2026 16:10:22 -0800
Subject: [PATCH 07/10] Add in code comment as requested by reviewers.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 280139359f715..00fb4a2a88492 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -982,6 +982,10 @@ struct ConvertXeVMToLLVMPass
       RewritePatternSet vectorPatterns(&getContext());
       vectorPatterns.add<HandleVectorExtractPattern>(&getContext());
       GreedyRewriteConfig config{};
+      // folding can remove ops with temporary attributes used to
+      // represent LLVM metadata, so disable it here.
+      // Effectively just this single pattern is applied without any
+      // op folding patterns from dialects.
       config.enableFolding(false);
       if (failed(applyPatternsGreedily(getOperation(),
                                        std::move(vectorPatterns), config)))

>From 8b3852eccc3f7128dac105e422c737e6e589cf44 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Mon, 26 Jan 2026 11:48:22 -0800
Subject: [PATCH 08/10] Unify conditionals.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 00fb4a2a88492..3c23476ff2e81 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -929,8 +929,9 @@ class HandleVectorExtractPattern
             rewriter, loc, ty, srcShuffle.getV1(), srcShuffle.getV1(),
             DenseI32ArrayAttr::get(rewriter.getContext(), combinedMask));
         rewriter.replaceOp(op, newShuffle);
-      } else if (auto loadOp = src.getDefiningOp<LLVM::LoadOp>()) {
+      } else if (isa<LLVM::LoadOp>(srcOp)) {
         // 3. Merge with load as a smaller load
+        auto loadOp = cast<LLVM::LoadOp>(srcOp);
         auto loadPtr = loadOp.getAddr();
         auto loadTy = dyn_cast<VectorType>(loadOp.getType());
         auto elemTy = loadTy.getElementType();

>From 19f84a35ee09a221aa1954f7c78adcdbccd940bd Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 27 Jan 2026 21:04:38 +0000
Subject: [PATCH 09/10] Fix bitcast handling issue and pass greedy rewriter
 failure.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 47 +++++++++++++++----
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 3c23476ff2e81..fced01686b0ba 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -898,8 +898,7 @@ class HandleVectorExtractPattern
     auto src = op.getV1();
     // 1. Hoist past unary element-wise operations
     if (auto srcOp = src.getDefiningOp()) {
-      if (isa<LLVM::FPExtOp>(srcOp) || isa<LLVM::FPTruncOp>(srcOp) ||
-          isa<LLVM::BitcastOp>(srcOp)) {
+      if (isa<LLVM::FPExtOp>(srcOp) || isa<LLVM::FPTruncOp>(srcOp)) {
         Value srcInput = srcOp->getOperand(0);
         // Create new shuffle vector op with unary input as source.
         auto srcVecTy = dyn_cast<VectorType>(srcInput.getType());
@@ -911,12 +910,43 @@ class HandleVectorExtractPattern
         Value newUnaryOp;
         if (isa<LLVM::FPExtOp>(srcOp)) {
           newUnaryOp = LLVM::FPExtOp::create(rewriter, loc, ty, newShuffle);
-        } else if (isa<LLVM::FPTruncOp>(srcOp)) {
+        } else {
           newUnaryOp = LLVM::FPTruncOp::create(rewriter, loc, ty, newShuffle);
-        } else if (isa<LLVM::BitcastOp>(srcOp)) {
-          newUnaryOp = LLVM::BitcastOp::create(rewriter, loc, ty, newShuffle);
         }
         rewriter.replaceOp(op, newUnaryOp);
+      } else if (isa<LLVM::BitcastOp>(srcOp)) {
+        Value srcInput = srcOp->getOperand(0);
+        // Create new shuffle vector op with unary input as source.
+        auto srcInputVecTy = dyn_cast<VectorType>(srcInput.getType());
+        auto srcInputSize = srcInputVecTy.getNumElements();
+        auto srcResVecTy = dyn_cast<VectorType>(srcOp->getResult(0).getType());
+        auto srcResSize = srcResVecTy.getNumElements();
+        auto maskSize = static_cast<int32_t>(mask.size());
+        if (srcInputSize > srcResSize) {
+          return failure();
+        }
+        if (srcResSize % srcInputSize != 0) {
+          return failure();
+        }
+        auto maskScale = srcResSize / srcInputSize;
+        if (maskScale != 1) {
+          // Create a new mask that maps to the source vector
+          SmallVector<int32_t> newMask;
+          int32_t newMaskSize = maskSize / maskScale;
+          int32_t maskStart = mask[0] / maskScale;
+          for (int32_t i = 0; i < newMaskSize; ++i) {
+            newMask.push_back(maskStart + i);
+          }
+          mask = newMask;
+        }
+        auto newShuffleVecTy =
+            VectorType::get(srcInputSize, srcInputVecTy.getElementType());
+        auto newShuffle = LLVM::ShuffleVectorOp::create(
+            rewriter, loc, newShuffleVecTy, srcInput, srcInput, mask);
+        // Create new unary op with new shuffle as input.
+        auto newBitcast =
+            LLVM::BitcastOp::create(rewriter, loc, ty, newShuffle);
+        rewriter.replaceOp(op, newBitcast);
       } else if (isa<LLVM::ShuffleVectorOp>(srcOp)) {
         // 2. Merge with another shuffle vector op
         auto srcShuffle = cast<LLVM::ShuffleVectorOp>(srcOp);
@@ -988,9 +1018,10 @@ struct ConvertXeVMToLLVMPass
       // Effectively just this single pattern is applied without any
       // op folding patterns from dialects.
       config.enableFolding(false);
-      if (failed(applyPatternsGreedily(getOperation(),
-                                       std::move(vectorPatterns), config)))
-        signalPassFailure();
+      // config.setMaxIterations(GreedyRewriteConfig::kNoLimit);
+      // config.setMaxNumRewrites(GreedyRewriteConfig::kNoLimit);
+      (void)applyPatternsGreedily(getOperation(), std::move(vectorPatterns),
+                                  config);
     }
   }
 };

>From bad9437ff45caf73475697b010121f40a4b47ede Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 27 Jan 2026 23:48:55 +0000
Subject: [PATCH 10/10] Check mask value when scaling.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index fced01686b0ba..dccc027913b8c 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -930,6 +930,9 @@ class HandleVectorExtractPattern
         }
         auto maskScale = srcResSize / srcInputSize;
         if (maskScale != 1) {
+          if (mask[0] % maskScale != 0) {
+            return failure();
+          }
           // Create a new mask that maps to the source vector
           SmallVector<int32_t> newMask;
           int32_t newMaskSize = maskSize / maskScale;