[Mlir-commits] [mlir] 63897a5 - [MLIR][SROA] Replace pattern based approach with a one-shot one (#85437)

Mon Mar 18 00:33:12 PDT 2024

Author: Christian Ulmann
Date: 2024-03-18T08:33:09+01:00
New Revision: 63897a595a75e9dfc3313610432171f44287db11

URL: https://github.com/llvm/llvm-project/commit/63897a595a75e9dfc3313610432171f44287db11
DIFF: https://github.com/llvm/llvm-project/commit/63897a595a75e9dfc3313610432171f44287db11.diff

LOG: [MLIR][SROA] Replace pattern based approach with a one-shot one (#85437)

This commit changes MLIR's SROA implementation back from being pattern
based into a full pass. This is beneficial for upcoming changes that
rely more heavily on the datalayout.

Unfortunately, this change required substantial test changes, as the
IRBuilder no cleans up the IR.

Added: 
    

Modified: 
    mlir/include/mlir/Transforms/SROA.h
    mlir/lib/Transforms/SROA.cpp
    mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Transforms/SROA.h b/mlir/include/mlir/Transforms/SROA.h
index 0b3e72400c28733..1af1fe930723f1c 100644

--- a/mlir/include/mlir/Transforms/SROA.h
+++ b/mlir/include/mlir/Transforms/SROA.h
@@ -9,11 +9,9 @@
 #ifndef MLIR_TRANSFORMS_SROA_H
 #define MLIR_TRANSFORMS_SROA_H
 
-#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Support/LogicalResult.h"
 #include "llvm/ADT/Statistic.h"
-#include <variant>
 
 namespace mlir {
 
@@ -29,24 +27,6 @@ struct SROAStatistics {
   llvm::Statistic *maxSubelementAmount = nullptr;
 };
 
-/// Pattern applying SROA to the regions of the operations on which it
-/// matches.
-class SROAPattern
-    : public OpInterfaceRewritePattern<DestructurableAllocationOpInterface> {
-public:
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-
-  SROAPattern(MLIRContext *context, SROAStatistics statistics = {},
-              PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}
-
-  LogicalResult matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const override;
-
-private:
-  SROAStatistics statistics;
-};
-
 /// Attempts to destructure the slots of destructurable allocators. Returns
 /// failure if no slot was destructured.
 LogicalResult tryToDestructureMemorySlots(

diff  --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index 3ceda51e1c894c7..6111489bdebefd4 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Transforms/SROA.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
 namespace mlir {
@@ -205,13 +204,6 @@ LogicalResult mlir::tryToDestructureMemorySlots(
   return success(destructuredAny);
 }
 
-LogicalResult
-SROAPattern::matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                             PatternRewriter &rewriter) const {
-  hasBoundedRewriteRecursion();
-  return tryToDestructureMemorySlots({allocator}, rewriter, statistics);
-}
-
 namespace {
 
 struct SROA : public impl::SROABase<SROA> {
@@ -223,12 +215,35 @@ struct SROA : public impl::SROABase<SROA> {
     SROAStatistics statistics{&destructuredAmount, &slotsWithMemoryBenefit,
                               &maxSubelementAmount};
 
-    RewritePatternSet rewritePatterns(&getContext());
-    rewritePatterns.add<SROAPattern>(&getContext(), statistics);
-    FrozenRewritePatternSet frozen(std::move(rewritePatterns));
+    bool changed = false;
+
+    for (Region &region : scopeOp->getRegions()) {
+      if (region.getBlocks().empty())
+        continue;
 
-    if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen)))
-      signalPassFailure();
+      OpBuilder builder(&region.front(), region.front().begin());
+      IRRewriter rewriter(builder);
+
+      // Destructuring a slot can allow for further destructuring of other
+      // slots, destructuring is tried until no destructuring succeeds.
+      while (true) {
+        SmallVector<DestructurableAllocationOpInterface> allocators;
+        // Build a list of allocators to attempt to destructure the slots of.
+        // TODO: Update list on the fly to avoid repeated visiting of the same
+        // allocators.
+        region.walk([&](DestructurableAllocationOpInterface allocator) {
+          allocators.emplace_back(allocator);
+        });
+
+        if (failed(
+                tryToDestructureMemorySlots(allocators, rewriter, statistics)))
+          break;
+
+        changed = true;
+      }
+    }
+    if (!changed)
+      markAllAnalysesPreserved();
   }
 };
 

diff  --git a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
index c2e3458134ba4b6..ba73025814cc054 100644
--- a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
@@ -146,12 +146,10 @@ llvm.func @invalid_indirect_memset() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_double_use
 llvm.func @memset_double_use() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -159,8 +157,11 @@ llvm.func @memset_double_use() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // We expect two generated memset, one for each field.
   // CHECK-NOT: "llvm.intr.memset"
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   %2 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f32)>
@@ -208,13 +209,10 @@ llvm.func @memset_considers_alignment() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_considers_packing
 llvm.func @memset_considers_packing() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
-  // CHECK-DAG: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i8, i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -222,7 +220,10 @@ llvm.func @memset_considers_packing() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // Now all fields are touched by the memset.
   // CHECK-NOT: "llvm.intr.memset"
+  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_WHOLE]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_PARTIAL]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
@@ -241,14 +242,14 @@ llvm.func @memset_considers_packing() -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memcpy_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -261,9 +262,8 @@ llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -271,14 +271,18 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%other_array, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32
@@ -289,14 +293,19 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
 
 // CHECK-LABEL: llvm.func @memcpy_double
 llvm.func @memcpy_double() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-COUNT-2: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // TODO: This should also disappear as a GEP with all zero indices should be
+  // ignored.
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<1 x i32>
   %1 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
   %2 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
+  // Match the dead constant, to avoid collision with the newly created one.
+  // CHECK: llvm.mlir.constant
   %memcpy_len = llvm.mlir.constant(4 : i32) : i32
   // CHECK-NOT: "llvm.intr.memcpy"
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memcpy"
   "llvm.intr.memcpy"(%1, %2, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
@@ -346,14 +355,14 @@ llvm.func @memcpy_no_volatile(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memmove_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memmove"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%1, %other_array, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -366,9 +375,7 @@ llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -376,14 +383,18 @@ llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%other_array, %1, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32