[Mlir-commits] [mlir] [MLIR][XeVM] Remove xevm to llvm from convert to llvm (PR #175672)

Fri Jan 23 09:10:40 PST 2026

https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/175672

>From a465a8891e9905bb74666d9f433ed826aa82b505 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 9 Jan 2026 11:44:44 -0800
Subject: [PATCH 1/6] [MLIR][XeVM] Unregister convert xevm to llvm patterns
 from convert to llvm interface. convert xevm to llvm is a target specific
 pass tied to SPIRV OpenCL kernels. As such, its conversion patterns should
 not be part of the generic convert to llvm pass.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 28 -------------------
 .../GPU/Pipelines/GPUToXeVMPipeline.cpp       |  2 ++
 mlir/lib/RegisterAllExtensions.cpp            |  2 --
 3 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 20a420dfda65c..79e256346574b 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -8,7 +8,6 @@
 
 #include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h"
 
-#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -881,28 +880,6 @@ struct ConvertXeVMToLLVMPass
 };
 } // namespace
 
-//===----------------------------------------------------------------------===//
-// ConvertToLLVMPatternInterface implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// Implement the interface to convert XeVM to LLVM.
-struct XeVMToLLVMDialectInterface : public ConvertToLLVMPatternInterface {
-  using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface;
-  void loadDependentDialects(MLIRContext *context) const final {
-    context->loadDialect<LLVM::LLVMDialect>();
-  }
-
-  /// Hook for derived dialect interface to provide conversion patterns
-  /// and mark dialect legal for the conversion target.
-  void populateConvertToLLVMConversionPatterns(
-      ConversionTarget &target, LLVMTypeConverter &typeConverter,
-      RewritePatternSet &patterns) const final {
-    populateXeVMToLLVMConversionPatterns(target, patterns);
-  }
-};
-} // namespace
-
 //===----------------------------------------------------------------------===//
 // Pattern Population
 //===----------------------------------------------------------------------===//
@@ -938,8 +915,3 @@ void ::mlir::populateXeVMToLLVMConversionPatterns(ConversionTarget &target,
       patterns.getContext());
 }
 
-void ::mlir::registerConvertXeVMToLLVMInterface(DialectRegistry &registry) {
-  registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) {
-    dialect->addInterfaces<XeVMToLLVMDialectInterface>();
-  });
-}
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 38313dc3c01d5..3cee74a06e81f 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/MathToXeVM/MathToXeVM.h"
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
@@ -113,6 +114,7 @@ void buildPostGPUCommonPassPipeline(
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertVectorToLLVMPass());
   pm.addPass(createConvertToLLVMPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp
index 4312100a0c0b0..7850d303c1283 100644
--- a/mlir/lib/RegisterAllExtensions.cpp
+++ b/mlir/lib/RegisterAllExtensions.cpp
@@ -32,7 +32,6 @@
 #include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
-#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h"
 #include "mlir/Dialect/AMX/Transforms.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
 #include "mlir/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.h"
@@ -93,7 +92,6 @@ void mlir::registerAllExtensions(DialectRegistry &registry) {
   gpu::registerConvertGpuToLLVMInterface(registry);
   NVVM::registerConvertGpuToNVVMInterface(registry);
   vector::registerConvertVectorToLLVMInterface(registry);
-  registerConvertXeVMToLLVMInterface(registry);
 
   // Register all transform dialect extensions.
   affine::registerTransformDialectExtension(registry);

>From 831db027ddb314af4c86956bbc55df6d2785d3b2 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Mon, 12 Jan 2026 10:58:11 -0800
Subject: [PATCH 2/6] Add pattern to hoist shufflevector ops that extract a
 contiguous slice.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 103 ++++++++++++++++++
 .../Conversion/XeVMToLLVM/xevm-to-llvm.mlir   |  14 +--
 2 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 79e256346574b..a4e6ad1c2c5b8 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -18,6 +18,7 @@
 
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -857,6 +858,99 @@ class SubgroupOpWorkitemOpToOCLPattern : public OpConversionPattern<OpType> {
   }
 };
 
+static bool isExtractingContiguousSlice(LLVM::ShuffleVectorOp op) {
+  if (op.getV1() != op.getV2())
+    return false;
+  auto maskAttr = op.getMask();
+  int64_t firstIndex = maskAttr[0];
+  for (int64_t i = 1; i < static_cast<int64_t>(maskAttr.size()); ++i) {
+    int64_t index = maskAttr[i];
+    if (index != firstIndex + i)
+      return false;
+  }
+  return true;
+}
+
+// The input vector of a shuffle vector op that extracts a contiguous slice is
+// an illegal vector in a SPIRV kernel if it has more than 16 elements.
+// To legalize this case, keep applying the following transformations until
+// none of them matches:
+//   1. hoist the shuffle vector op past unary element-wise operations
+//      (only fpext, fptrunc and bitcast are handled for now)
+//   2. merge with another shuffle vector op
+//   3. merge with load as a smaller load
+class HandleVectorExtractPattern
+    : public OpRewritePattern<LLVM::ShuffleVectorOp> {
+  using OpRewritePattern<LLVM::ShuffleVectorOp>::OpRewritePattern;
+
+  void initialize() { setHasBoundedRewriteRecursion(); }
+
+  LogicalResult matchAndRewrite(LLVM::ShuffleVectorOp op,
+                                PatternRewriter &rewriter) const override {
+
+    if (!isExtractingContiguousSlice(op))
+      return failure();
+
+    auto mask = op.getMask();
+    auto loc = op.getLoc();
+    auto ty = op.getType();
+    // Check source operand to determine rewrite pattern.
+    auto src = op.getV1();
+    // 1. Hoist past unary element-wise operations
+    if (auto srcOp = src.getDefiningOp()) {
+      if (isa<LLVM::FPExtOp>(srcOp) || isa<LLVM::FPTruncOp>(srcOp) ||
+          isa<LLVM::BitcastOp>(srcOp)) {
+        Value srcInput = srcOp->getOperand(0);
+        // Create new shuffle vector op with unary input as source.
+        auto srcVecTy = dyn_cast<VectorType>(srcInput.getType());
+        auto newShuffleVecTy =
+            VectorType::get(mask.size(), srcVecTy.getElementType());
+        auto newShuffle = LLVM::ShuffleVectorOp::create(
+            rewriter, loc, newShuffleVecTy, srcInput, srcInput, mask);
+        // Create new unary op with new shuffle as input.
+        Value newUnaryOp;
+        if (isa<LLVM::FPExtOp>(srcOp)) {
+          newUnaryOp = LLVM::FPExtOp::create(rewriter, loc, ty, newShuffle);
+        } else if (isa<LLVM::FPTruncOp>(srcOp)) {
+          newUnaryOp = LLVM::FPTruncOp::create(rewriter, loc, ty, newShuffle);
+        } else if (isa<LLVM::BitcastOp>(srcOp)) {
+          newUnaryOp = LLVM::BitcastOp::create(rewriter, loc, ty, newShuffle);
+        }
+        rewriter.replaceOp(op, newUnaryOp);
+      } else if (isa<LLVM::ShuffleVectorOp>(srcOp)) {
+        // 2. Merge with another shuffle vector op
+        auto srcShuffle = cast<LLVM::ShuffleVectorOp>(srcOp);
+        auto srcMask = srcShuffle.getMask();
+        SmallVector<int32_t> combinedMask;
+        for (auto index : mask) {
+          combinedMask.push_back(srcMask[index]);
+        }
+        auto newShuffle = LLVM::ShuffleVectorOp::create(
+            rewriter, loc, ty, srcShuffle.getV1(), srcShuffle.getV1(),
+            DenseI32ArrayAttr::get(rewriter.getContext(), combinedMask));
+        rewriter.replaceOp(op, newShuffle);
+      } else if (auto loadOp = src.getDefiningOp<LLVM::LoadOp>()) {
+        // 3. Merge with load as a smaller load
+        auto loadPtr = loadOp.getAddr();
+        auto loadTy = dyn_cast<VectorType>(loadOp.getType());
+        auto elemTy = loadTy.getElementType();
+        auto firstIndex = mask[0];
+        auto newVecTy = VectorType::get(mask.size(), elemTy);
+        auto newPtr = LLVM::GEPOp::create(
+            rewriter, loc,
+            LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                       loadPtr.getType().getAddressSpace()),
+            elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
+        auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
+        rewriter.replaceOp(op, newLoad);
+      } else {
+        return failure();
+      }
+    }
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//
@@ -876,6 +970,15 @@ struct ConvertXeVMToLLVMPass
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
       signalPassFailure();
+
+    // Apply in-dialect lowerings to handle illegal vectors
+    {
+      RewritePatternSet vectorPatterns(&getContext());
+      vectorPatterns.add<HandleVectorExtractPattern>(&getContext());
+      if (failed(
+              applyPatternsGreedily(getOperation(), std::move(vectorPatterns))))
+        signalPassFailure();
+    }
   }
 };
 } // namespace
diff --git a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
index 7f01526cb0a06..06a0ff5e7484b 100644
--- a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
+++ b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
@@ -1,21 +1,17 @@
 // RUN: mlir-opt --convert-xevm-to-llvm --split-input-file %s | FileCheck %s
 
-// Same below, but using the `ConvertToLLVMPatternInterface` entry point
-// and the generic `convert-to-llvm` pass.
-// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
-
 // CHECK-LABEL:      llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(
 // CHECK-SAME:   !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>,
 // CHECK-SAME:   !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
 // CHECK:      llvm.func @blockload2d(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi16> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -51,12 +47,12 @@ llvm.func @blockload2d_cache_control(%a: !llvm.ptr<1>, %base_width_a: i32, %base
 // CHECK:      llvm.func @blockload2d_v_blocks(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_v_blocks(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<16xi16> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -80,12 +76,12 @@ llvm.func @blockload2d_v_blocks(%a: !llvm.ptr<1>, %base_width_a: i32, %base_heig
 // CHECK:      llvm.func @blockload2d_pack_register(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_pack_register(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_16r16x1cPU3AS1viiiDv2_iPj(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -109,12 +105,12 @@ llvm.func @blockload2d_pack_register(%a: !llvm.ptr<1>, %base_width_a: i32, %base
 // CHECK:      llvm.func @blockload2d_transpose(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME:   %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32)
 llvm.func @blockload2d_transpose(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj(
   // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]])
@@ -138,12 +134,12 @@ llvm.func @blockload2d_transpose(%a: !llvm.ptr<1>, %base_width_a: i32, %base_hei
 // CHECK: llvm.func @blockstore2d(%[[ARG0:.*]]: !llvm.ptr<1>,
 // CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32, %[[ARG6:.*]]: vector<8xi32>) {
 llvm.func @blockstore2d(%c: !llvm.ptr<1>, %base_width_c: i32, %base_height_c: i32, %base_pitch_c: i32, %x: i32, %y: i32, %c_result_casted: vector<8xi32>) {
+  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32>
   // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32>
   // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32>
-  // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr
   // CHECK: llvm.store %[[ARG6]], %[[VAR6]] : vector<8xi32>, !llvm.ptr
   // CHECK: llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(

>From dd38883efd06fd47a3210f854590bfd5a74e8869 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 13 Jan 2026 09:37:02 -0800
Subject: [PATCH 3/6] Fix format issue.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index a4e6ad1c2c5b8..2710e8aa0e595 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -1017,4 +1017,3 @@ void ::mlir::populateXeVMToLLVMConversionPatterns(ConversionTarget &target,
                SubgroupOpWorkitemOpToOCLPattern<SubgroupSizeOp>>(
       patterns.getContext());
 }
-

>From 803c9ee29b782c438bd2947f0df08914312a3af7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 14 Jan 2026 15:05:30 -0800
Subject: [PATCH 4/6] GPU XeVM pipeline: Invoke xevm to llvm just before gpu
 module to binary.

---
 mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 3cee74a06e81f..43886f2013a28 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -114,10 +114,10 @@ void buildPostGPUCommonPassPipeline(
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertVectorToLLVMPass());
   pm.addPass(createConvertToLLVMPass());
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertXeVMToLLVMPass());
   // gpu-module-to-binary
   {
     GpuModuleToBinaryPassOptions gpuToModuleBinOptions;

>From e81b7bcc98140c2af82b19d47e9aa00a2a437c1c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 15 Jan 2026 08:57:46 -0800
Subject: [PATCH 5/6] Disable folding.

---
 mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 2710e8aa0e595..280139359f715 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -936,13 +936,19 @@ class HandleVectorExtractPattern
         auto elemTy = loadTy.getElementType();
         auto firstIndex = mask[0];
         auto newVecTy = VectorType::get(mask.size(), elemTy);
-        auto newPtr = LLVM::GEPOp::create(
-            rewriter, loc,
-            LLVM::LLVMPointerType::get(rewriter.getContext(),
-                                       loadPtr.getType().getAddressSpace()),
-            elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
-        auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
-        rewriter.replaceOp(op, newLoad);
+        // GEPOp is needed if first index is not zero
+        if (firstIndex) {
+          auto newPtr = LLVM::GEPOp::create(
+              rewriter, loc,
+              LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                         loadPtr.getType().getAddressSpace()),
+              elemTy, loadPtr, ArrayRef<LLVM::GEPArg>{firstIndex});
+          auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, newPtr);
+          rewriter.replaceOp(op, newLoad);
+        } else {
+          auto newLoad = LLVM::LoadOp::create(rewriter, loc, newVecTy, loadPtr);
+          rewriter.replaceOp(op, newLoad);
+        }
       } else {
         return failure();
       }
@@ -975,8 +981,10 @@ struct ConvertXeVMToLLVMPass
     {
       RewritePatternSet vectorPatterns(&getContext());
       vectorPatterns.add<HandleVectorExtractPattern>(&getContext());
-      if (failed(
-              applyPatternsGreedily(getOperation(), std::move(vectorPatterns))))
+      GreedyRewriteConfig config{};
+      config.enableFolding(false);
+      if (failed(applyPatternsGreedily(getOperation(),
+                                       std::move(vectorPatterns), config)))
         signalPassFailure();
     }
   }

>From fc923b6388ef54f8444a921a75447a36046d2140 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 23 Jan 2026 09:06:57 -0800
Subject: [PATCH 6/6] Add test case.

---
 .../XeVMToLLVM/legalize_large_vector.mlir     | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir

diff --git a/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir b/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir
new file mode 100644
index 0000000000000..150f6dba5cbaa
--- /dev/null
+++ b/mlir/test/Conversion/XeVMToLLVM/legalize_large_vector.mlir
@@ -0,0 +1,51 @@
+// RUN: mlir-opt --convert-xevm-to-llvm --split-input-file %s | FileCheck %s
+
+module @test_illegal_vector {
+  // CHECK-LABEL: llvm.func @test_illegal_vector
+  // CHECK: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: !llvm.ptr, %[[ARG4:.*]]: !llvm.ptr
+  llvm.func @test_illegal_vector(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr, %arg4: !llvm.ptr) {
+    // CHECK: %[[LOAD0:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST0:.*]] = llvm.bitcast %[[LOAD0]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT0:.*]] = llvm.fpext %[[BITCAST0]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ARG0]][8] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD1:.*]] = llvm.load %[[GEP0]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST1:.*]] = llvm.bitcast %[[LOAD1]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT1:.*]] = llvm.fpext %[[BITCAST1]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[BITCAST2:.*]] = llvm.bitcast %[[FPEXT0]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST2]], %[[ARG1]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[BITCAST3:.*]] = llvm.bitcast %[[FPEXT1]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST3]], %[[ARG2]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[ARG0]][16] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD2:.*]] = llvm.load %[[GEP1]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST4:.*]] = llvm.bitcast %[[LOAD2]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT2:.*]] = llvm.fpext %[[BITCAST4]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[GEP2:.*]] = llvm.getelementptr %[[ARG0]][24] : (!llvm.ptr) -> !llvm.ptr, i16
+    // CHECK: %[[LOAD3:.*]] = llvm.load %[[GEP2]] : !llvm.ptr -> vector<8xi16>
+    // CHECK: %[[BITCAST5:.*]] = llvm.bitcast %[[LOAD3]] : vector<8xi16> to vector<8xf16>
+    // CHECK: %[[FPEXT3:.*]] = llvm.fpext %[[BITCAST5]] : vector<8xf16> to vector<8xf32>
+    // CHECK: %[[BITCAST6:.*]] = llvm.bitcast %[[FPEXT2]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST6]], %[[ARG3]] : vector<8xi32>, !llvm.ptr
+    // CHECK: %[[BITCAST7:.*]] = llvm.bitcast %[[FPEXT3]] : vector<8xf32> to vector<8xi32>
+    // CHECK: llvm.store %[[BITCAST7]], %[[ARG4]] : vector<8xi32>, !llvm.ptr
+    // CHECK: llvm.return
+      %0 = llvm.load %arg0 : !llvm.ptr -> vector<32xi16>
+      %1 = llvm.bitcast %0 : vector<32xi16> to vector<32xf16>
+      %2 = llvm.shufflevector %1, %1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf16>
+      %3 = llvm.shufflevector %1, %1 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf16>
+      %4 = llvm.fpext %2 : vector<16xf16> to vector<16xf32>
+      %5 = llvm.fpext %3 : vector<16xf16> to vector<16xf32>
+      %6 = llvm.shufflevector %4, %4 [0, 1, 2, 3, 4, 5, 6, 7] : vector<16xf32>
+      %7 = llvm.shufflevector %4, %4 [8, 9, 10, 11, 12, 13, 14, 15] : vector<16xf32>
+      %8 = llvm.bitcast %6 : vector<8xf32> to vector<8xi32>
+      llvm.store %8, %arg1 : vector<8xi32>, !llvm.ptr
+      %9 = llvm.bitcast %7 : vector<8xf32> to vector<8xi32>
+      llvm.store %9, %arg2 : vector<8xi32>, !llvm.ptr
+      %10 = llvm.shufflevector %5, %5 [0, 1, 2, 3, 4, 5, 6, 7] : vector<16xf32>
+      %11 = llvm.shufflevector %5, %5 [8, 9, 10, 11, 12, 13, 14, 15] : vector<16xf32>
+      %12 = llvm.bitcast %10 : vector<8xf32> to vector<8xi32>
+      llvm.store %12, %arg3 : vector<8xi32>, !llvm.ptr
+      %13 = llvm.bitcast %11 : vector<8xf32> to vector<8xi32>
+      llvm.store %13, %arg4 : vector<8xi32>, !llvm.ptr
+      llvm.return
+  }
+}