[llvm] [LoopIdiom] Perform loop versioning to use memcpy (PR #125043)

Thu Jan 30 01:55:40 PST 2025

https://github.com/kasuga-fj created https://github.com/llvm/llvm-project/pull/125043

LoopIdiomRecognize currently gives up on a loop if the target load and store may alias. This patch introduces loop versioning within LoopIdiomRecognize so that the memcpy replacement can still be applied in such cases. Currently, loop versioning is only performed for loops that are dedicated to transferring content from a load to a store. This limitation prevents the code size from growing.

Related to #50892

>From 549b8168aa15fafa5e61e83cf20b92e5b3efbc1f Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 24 Jan 2025 12:33:26 +0000
Subject: [PATCH] [LoopIdiom] Perform loop versioning to use memcpy

LoopIdiomRecognize currently gives up on a loop if the target load and
store may alias. This patch introduces loop versioning within
LoopIdiomRecognize so that the memcpy replacement can still be applied
in such cases. Currently, loop versioning is only performed for loops
that are dedicated to transferring content from a load to a store. This
limitation prevents the code size from growing.

Related to #50892
---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 133 +++++++++--
 llvm/test/Transforms/LoopIdiom/basic.ll       |  30 +--
 .../expander-do-not-delete-reused-values.ll   | 112 +++++++---
 .../LoopIdiom/memcpy-loop-versioning.ll       | 210 ++++++++++++++++++
 llvm/test/Transforms/LoopIdiom/pr82337.ll     |   2 +-
 5 files changed, 431 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/memcpy-loop-versioning.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 2462ec33e0c202..089a58998603d8 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -80,9 +80,11 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <algorithm>
 #include <cassert>
@@ -132,6 +134,16 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
              "with -Os/-Oz"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableLoopVersioning(
+    "enable-" DEBUG_TYPE "-version",
+    cl::desc("Allow loop idiom recognize to version loop(s) when converting"),
+    cl::init(true), cl::ReallyHidden);
+
+static cl::opt<int> LoopVersioningLengthLimit(
+    DEBUG_TYPE "-lv-length-limit",
+    cl::desc("Lower length limit for loop versioning"), cl::init(12),
+    cl::ReallyHidden);
+
 namespace {
 
 class LoopIdiomRecognize {
@@ -146,6 +158,7 @@ class LoopIdiomRecognize {
   OptimizationRemarkEmitter &ORE;
   bool ApplyCodeSizeHeuristics;
   std::unique_ptr<MemorySSAUpdater> MSSAU;
+  const LoopAccessInfo &LAI;
 
 public:
   explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
@@ -153,8 +166,10 @@ class LoopIdiomRecognize {
                               TargetLibraryInfo *TLI,
                               const TargetTransformInfo *TTI, MemorySSA *MSSA,
                               const DataLayout *DL,
-                              OptimizationRemarkEmitter &ORE)
-      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
+                              OptimizationRemarkEmitter &ORE,
+                              const LoopAccessInfo &LAI)
+      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE),
+        LAI(LAI) {
     if (MSSA)
       MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
   }
@@ -220,6 +235,9 @@ class LoopIdiomRecognize {
                                   const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);
+  bool shouldVersionLoopForMemCpy(Instruction *TheStore,
+                                  Instruction *TheLoad) const;
+  void versionLoop(const SCEV *BECount, SCEVExpander &Expander);
 
   /// @}
   /// \name Noncountable Loop Idiom Handling
@@ -264,8 +282,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
   // but ORE cannot be preserved (see comment before the pass definition).
   OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
 
+  LoopAccessInfoManager LAIs(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, &AR.TLI);
   LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
-                         AR.MSSA, DL, ORE);
+                         AR.MSSA, DL, ORE, LAIs.getInfo(L));
   if (!LIR.runOnLoop(&L))
     return PreservedAnalyses::all();
 
@@ -1359,13 +1378,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
   }
 
   bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
-  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
-
   if (IsAtomic) {
-    // For now don't support unordered atomic memmove.
-    if (UseMemMove)
-      return Changed;
-
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
     assert((StoreAlign && LoadAlign) &&
@@ -1381,14 +1394,29 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
       return Changed;
   }
 
-  if (UseMemMove)
-    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
-                                             IsMemCpy))
-      return Changed;
-
   if (avoidLIRForMultiBlockLoop())
     return Changed;
 
+  bool MayOverlap = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
+  bool UseMemMove = false;
+
+  // First, see if it is possible to use memmove. If not, determine whether we
+  // should version the loops to replace the instructions with memcpy. If both
+  // are rejected, then bail out.
+  // TODO: It may be better to perform the versioning at first, then use memcpy
+  // in the versioned loop and memmove in the original loop.
+  if (MayOverlap) {
+    // For now don't support unordered atomic memmove.
+    if (!IsAtomic && Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride,
+                                                         *TheLoad, IsMemCpy)) {
+      UseMemMove = true;
+    } else if (shouldVersionLoopForMemCpy(TheStore, TheLoad)) {
+      versionLoop(BECount, Expander);
+    } else {
+      return Changed;
+    }
+  }
+
   // Okay, everything is safe, we can transform this!
 
   const SCEV *NumBytesS =
@@ -1486,6 +1514,83 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
   return false;
 }
 
+// Returns true if we should version the loop and make sure that there is no
+// alias between the store and the load. This allows us to use `memcpy` instead
+// of `memmove`. However, versioning increases the code size. In the worst case,
+// if there are multiple load/store pairs, the code size increases
+// exponentially. Therefore, versioning is supported only if the loop only does
+// transfers related to this store and load. That is, we will version the loop
+// as follows:
+//
+//   ```
+//   for (i=0; i<len; i++)
+//     dst[i] = src[i];
+//   ```
+//
+// But we don't want to do this if there are other processes inside the loop,
+// e.g.,
+//
+//   ```
+//   acc = 0;
+//   for (i=0; i<len; i++) {
+//     dst[i] = src[i];
+//     acc += ...;
+//   }
+//   ```
+bool LoopIdiomRecognize::shouldVersionLoopForMemCpy(
+    Instruction *TheStore, Instruction *TheLoad) const {
+  if (ApplyCodeSizeHeuristics || !EnableLoopVersioning)
+    return false;
+
+  // If no runtime pointer checks were generated (e.g. the load and store
+  // always overlap), there is nothing to version on, so avoid versioning.
+  auto *Checking = LAI.getRuntimePointerChecking();
+  if (Checking->getNumberOfChecks() == 0)
+    return false;
+
+  BasicBlock *Cur = TheStore->getParent();
+  for (auto &I : *Cur) {
+    if (I.isDebugOrPseudoInst() || I.isTerminator())
+      continue;
+
+    // If there is a memory instruction other than `TheStore` and `TheLoad`,
+    // then bail out.
+    if (I.mayReadOrWriteMemory() && (&I) != TheStore && (&I) != TheLoad)
+      return false;
+
+    // Also give up if any value defined in this block is used outside of it,
+    // i.e. the loop computes something besides the copy and loop control.
+    for (const auto &U : I.uses()) {
+      const Instruction *UseI = cast<Instruction>(U.getUser());
+      if (UseI->getParent() != Cur)
+        return false;
+    }
+  }
+  return true;
+}
+
+void LoopIdiomRecognize::versionLoop(const SCEV *BECount,
+                                     SCEVExpander &Expander) {
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  LoopVersioning LVer(LAI, LAI.getRuntimePointerChecking()->getChecks(),
+                      CurLoop, LI, DT, SE);
+  LVer.versionLoop();
+
+  // For short loops the call overhead may outweigh the benefit, so also take
+  // the original loop when the backedge-taken count is below the limit.
+  auto *Branch = dyn_cast<BranchInst>(Preheader->getTerminator());
+  if (!Branch || !Branch->isConditional())
+    return;
+  Type *IntTy = BECount->getType();
+  Value *Cond = Branch->getCondition();
+  Value *TripCount = Expander.expandCodeFor(BECount, IntTy, Branch);
+  IRBuilder<> Builder(Branch);
+  Value *BoundCond = Builder.CreateICmpSLT(
+      TripCount, ConstantInt::get(IntTy, LoopVersioningLengthLimit));
+  Value *NewCond = Builder.CreateOr(Cond, BoundCond);
+  Branch->setCondition(NewCond);
+}
+
 bool LoopIdiomRecognize::runOnNoncountableLoop() {
   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                     << CurLoop->getHeader()->getParent()->getName()
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
index e6fc11625317b5..fada7a08473158 100644
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -624,27 +624,27 @@ for.end:                                          ; preds = %for.body
 
 
 
-; PR9815 - This is a partial overlap case that cannot be safely transformed
-; into a memcpy.
+; This is a partial overlap case that needs alias checks to be safely
+; transformed into a memcpy.
 @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
 
 define i32 @test14() nounwind {
 ; CHECK-LABEL: @test14(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[T5:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[T5]], 4
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [7 x i32], ptr @g_50, i32 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[T5]], 5
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [7 x i32], ptr @g_50, i32 0, i64 [[IDXPROM5]]
-; CHECK-NEXT:    store i32 [[T2]], ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[T5]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 2
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT:    [[T6:%.*]] = phi i32 [ [[INC1:%.*]], [[FOR_BODY1]] ], [ 0, [[FOR_BODY_PH:%.*]] ]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], 4
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [7 x i32], ptr @g_50, i32 0, i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[T3:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[T6]], 5
+; CHECK-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [7 x i32], ptr @g_50, i32 0, i64 [[IDXPROM6]]
+; CHECK-NEXT:    store i32 [[T3]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INC1]] = add nsw i32 [[T6]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[INC1]], 2
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY1]], label [[FOR_END_LOOPEXIT1:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr getelementptr inbounds ([7 x i32], ptr @g_50, i32 0, i64 6), align 4
 ; CHECK-NEXT:    ret i32 [[T8]]
diff --git a/llvm/test/Transforms/LoopIdiom/expander-do-not-delete-reused-values.ll b/llvm/test/Transforms/LoopIdiom/expander-do-not-delete-reused-values.ll
index 0c1f5b9c029989..0a8c190e509486 100644
--- a/llvm/test/Transforms/LoopIdiom/expander-do-not-delete-reused-values.ll
+++ b/llvm/test/Transforms/LoopIdiom/expander-do-not-delete-reused-values.ll
@@ -1,34 +1,94 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-idiom -S %s | FileCheck %s
+; RUN: opt -passes=loop-idiom -enable-loop-idiom-version=0 -S %s | FileCheck %s --check-prefix=CHECK-NO-VERSION
+; RUN: opt -passes=loop-idiom -S %s | FileCheck %s --check-prefix=CHECK-VERSION
 
 ; Make sure we do not delete instructions not inserted during expansion, e.g.
-; because the expande re-used existing instructions.
+; because the expander re-used existing instructions.
 
 define void @test(i64 %init, ptr %ptr) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
-; CHECK:       outer.header:
-; CHECK-NEXT:    [[J_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[OUTER_LATCH:%.*]] ]
-; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[OUTER_LATCH]] ]
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i32 [[I_0]]
-; CHECK-NEXT:    br label [[INNER:%.*]]
-; CHECK:       inner:
-; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER]] ], [ [[INIT:%.*]], [[OUTER_HEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[INNER_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ADD_PTR]], i64 [[INNER_IV]]
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], 1
-; CHECK-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 0
-; CHECK-NEXT:    br i1 [[EC_1]], label [[OUTER_LATCH]], label [[INNER]]
-; CHECK:       outer.latch:
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[J_0]], 1
-; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[I_0]], [[INC]]
-; CHECK-NEXT:    [[EC_2:%.*]] = icmp eq i32 [[ADD]], 4000
-; CHECK-NEXT:    br i1 [[EC_2]], label [[EXIT:%.*]], label [[OUTER_HEADER]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
+; CHECK-NO-VERSION-LABEL: @test(
+; CHECK-NO-VERSION-NEXT:  entry:
+; CHECK-NO-VERSION-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK-NO-VERSION:       outer.header:
+; CHECK-NO-VERSION-NEXT:    [[J_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NO-VERSION-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[OUTER_LATCH]] ]
+; CHECK-NO-VERSION-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i32 [[I_0]]
+; CHECK-NO-VERSION-NEXT:    br label [[INNER:%.*]]
+; CHECK-NO-VERSION:       inner:
+; CHECK-NO-VERSION-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER]] ], [ [[INIT:%.*]], [[OUTER_HEADER]] ]
+; CHECK-NO-VERSION-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[INNER_IV]]
+; CHECK-NO-VERSION-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NO-VERSION-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ADD_PTR]], i64 [[INNER_IV]]
+; CHECK-NO-VERSION-NEXT:    store i32 [[TMP0]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NO-VERSION-NEXT:    [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], 1
+; CHECK-NO-VERSION-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 0
+; CHECK-NO-VERSION-NEXT:    br i1 [[EC_1]], label [[OUTER_LATCH]], label [[INNER]]
+; CHECK-NO-VERSION:       outer.latch:
+; CHECK-NO-VERSION-NEXT:    [[INC]] = add nuw nsw i32 [[J_0]], 1
+; CHECK-NO-VERSION-NEXT:    [[ADD]] = add nuw nsw i32 [[I_0]], [[INC]]
+; CHECK-NO-VERSION-NEXT:    [[EC_2:%.*]] = icmp eq i32 [[ADD]], 4000
+; CHECK-NO-VERSION-NEXT:    br i1 [[EC_2]], label [[EXIT:%.*]], label [[OUTER_HEADER]]
+; CHECK-NO-VERSION:       exit:
+; CHECK-NO-VERSION-NEXT:    ret void
+;
+; CHECK-VERSION-LABEL: @test(
+; CHECK-VERSION-NEXT:  entry:
+; CHECK-VERSION-NEXT:    [[TMP0:%.*]] = shl i64 [[INIT:%.*]], 2
+; CHECK-VERSION-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]]
+; CHECK-VERSION-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP0]]
+; CHECK-VERSION-NEXT:    [[TMP9:%.*]] = sub i64 -1, [[INIT]]
+; CHECK-VERSION-NEXT:    [[TMP1:%.*]] = mul i64 [[INIT]], -4
+; CHECK-VERSION-NEXT:    br label [[INNER_LVER_CHECK:%.*]]
+; CHECK-VERSION:       inner.lver.check:
+; CHECK-VERSION-NEXT:    [[J_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-VERSION-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[OUTER_LATCH]] ]
+; CHECK-VERSION-NEXT:    [[TMP2:%.*]] = sext i32 [[I_0]] to i64
+; CHECK-VERSION-NEXT:    [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2
+; CHECK-VERSION-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP2]], i64 [[TMP3]]
+; CHECK-VERSION-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP3]]
+; CHECK-VERSION-NEXT:    [[TMP4:%.*]] = sext i32 [[I_0]] to i64
+; CHECK-VERSION-NEXT:    [[TMP5:%.*]] = shl nsw i64 [[TMP4]], 2
+; CHECK-VERSION-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP5]]
+; CHECK-VERSION-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR]], i32 [[I_0]]
+; CHECK-VERSION-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP3]], [[PTR]]
+; CHECK-VERSION-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP4]]
+; CHECK-VERSION-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VERSION-NEXT:    [[TMP10:%.*]] = icmp slt i64 [[TMP9]], 12
+; CHECK-VERSION-NEXT:    [[TMP8:%.*]] = or i1 [[FOUND_CONFLICT]], [[TMP10]]
+; CHECK-VERSION-NEXT:    br i1 [[TMP8]], label [[INNER_PH_LVER_ORIG:%.*]], label [[INNER_PH:%.*]]
+; CHECK-VERSION:       inner.ph.lver.orig:
+; CHECK-VERSION-NEXT:    br label [[INNER_LVER_ORIG:%.*]]
+; CHECK-VERSION:       inner.lver.orig:
+; CHECK-VERSION-NEXT:    [[INNER_IV_LVER_ORIG:%.*]] = phi i64 [ [[INNER_IV_NEXT_LVER_ORIG:%.*]], [[INNER_LVER_ORIG]] ], [ [[INIT]], [[INNER_PH_LVER_ORIG]] ]
+; CHECK-VERSION-NEXT:    [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[INNER_IV_LVER_ORIG]]
+; CHECK-VERSION-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_LVER_ORIG]], align 4
+; CHECK-VERSION-NEXT:    [[ARRAYIDX3_LVER_ORIG:%.*]] = getelementptr inbounds float, ptr [[ADD_PTR]], i64 [[INNER_IV_LVER_ORIG]]
+; CHECK-VERSION-NEXT:    store i32 [[TMP6]], ptr [[ARRAYIDX3_LVER_ORIG]], align 4
+; CHECK-VERSION-NEXT:    [[INNER_IV_NEXT_LVER_ORIG]] = add nsw i64 [[INNER_IV_LVER_ORIG]], 1
+; CHECK-VERSION-NEXT:    [[EC_1_LVER_ORIG:%.*]] = icmp eq i64 [[INNER_IV_NEXT_LVER_ORIG]], 0
+; CHECK-VERSION-NEXT:    br i1 [[EC_1_LVER_ORIG]], label [[OUTER_LATCH_LOOPEXIT:%.*]], label [[INNER_LVER_ORIG]]
+; CHECK-VERSION:       inner.ph:
+; CHECK-VERSION-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[SCEVGEP1]], ptr align 4 [[SCEVGEP]], i64 [[TMP1]], i1 false)
+; CHECK-VERSION-NEXT:    br label [[INNER:%.*]]
+; CHECK-VERSION:       inner:
+; CHECK-VERSION-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER]] ], [ [[INIT]], [[INNER_PH]] ]
+; CHECK-VERSION-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[INNER_IV]]
+; CHECK-VERSION-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VERSION-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ADD_PTR]], i64 [[INNER_IV]]
+; CHECK-VERSION-NEXT:    [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], 1
+; CHECK-VERSION-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 0
+; CHECK-VERSION-NEXT:    br i1 [[EC_1]], label [[OUTER_LATCH_LOOPEXIT5:%.*]], label [[INNER]]
+; CHECK-VERSION:       outer.latch.loopexit:
+; CHECK-VERSION-NEXT:    br label [[OUTER_LATCH]]
+; CHECK-VERSION:       outer.latch.loopexit5:
+; CHECK-VERSION-NEXT:    br label [[OUTER_LATCH]]
+; CHECK-VERSION:       outer.latch:
+; CHECK-VERSION-NEXT:    [[INC]] = add nuw nsw i32 [[J_0]], 1
+; CHECK-VERSION-NEXT:    [[ADD]] = add nuw nsw i32 [[I_0]], [[INC]]
+; CHECK-VERSION-NEXT:    [[EC_2:%.*]] = icmp eq i32 [[ADD]], 4000
+; CHECK-VERSION-NEXT:    br i1 [[EC_2]], label [[EXIT:%.*]], label [[INNER_LVER_CHECK]]
+; CHECK-VERSION:       exit:
+; CHECK-VERSION-NEXT:    ret void
 ;
 entry:
   br label %outer.header
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-loop-versioning.ll b/llvm/test/Transforms/LoopIdiom/memcpy-loop-versioning.ll
new file mode 100644
index 00000000000000..4abce57e4bf205
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-loop-versioning.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-idiom < %s -S | FileCheck %s
+
+; If the loop only transfers content from the source to the destination but we
+; don't know whether they alias, version the loop.
+define dso_local void @issue50892(ptr %dst, i64 noundef %d_len, ptr %src) {
+; CHECK-LABEL: define dso_local void @issue50892(
+; CHECK-SAME: ptr [[DST:%.*]], i64 noundef [[D_LEN:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[D_LEN]]
+; CHECK-NEXT:    [[CMP4_NOT:%.*]] = icmp eq i64 [[D_LEN]], 0
+; CHECK-NEXT:    br i1 [[CMP4_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_BODY_LVER_CHECK:.*]]
+; CHECK:       [[WHILE_BODY_LVER_CHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[D_LEN]], [[DST1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[DST1]], 1
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[UMAX]], [[DST1]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[D_LEN]], [[DST1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[DST1]], 1
+; CHECK-NEXT:    [[UMAX4:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[UMAX4]], -1
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP9]], [[DST1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp slt i64 [[TMP10]], 12
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[FOUND_CONFLICT]], [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[UMAX4]], [[DST1]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[WHILE_BODY_PH_LVER_ORIG:.*]], label %[[WHILE_BODY_PH:.*]]
+; CHECK:       [[WHILE_BODY_PH_LVER_ORIG]]:
+; CHECK-NEXT:    br label %[[WHILE_BODY_LVER_ORIG:.*]]
+; CHECK:       [[WHILE_BODY_LVER_ORIG]]:
+; CHECK-NEXT:    [[DST_ADDR_06_LVER_ORIG:%.*]] = phi ptr [ [[INCDEC_PTR1_LVER_ORIG:%.*]], %[[WHILE_BODY_LVER_ORIG]] ], [ [[DST]], %[[WHILE_BODY_PH_LVER_ORIG]] ]
+; CHECK-NEXT:    [[SRC_ADDR_05_LVER_ORIG:%.*]] = phi ptr [ [[INCDEC_PTR_LVER_ORIG:%.*]], %[[WHILE_BODY_LVER_ORIG]] ], [ [[SRC]], %[[WHILE_BODY_PH_LVER_ORIG]] ]
+; CHECK-NEXT:    [[INCDEC_PTR_LVER_ORIG]] = getelementptr inbounds nuw i8, ptr [[SRC_ADDR_05_LVER_ORIG]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[SRC_ADDR_05_LVER_ORIG]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR1_LVER_ORIG]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_06_LVER_ORIG]], i64 1
+; CHECK-NEXT:    store i8 [[TMP6]], ptr [[DST_ADDR_06_LVER_ORIG]], align 1
+; CHECK-NEXT:    [[CMP_LVER_ORIG:%.*]] = icmp ult ptr [[INCDEC_PTR1_LVER_ORIG]], [[ADD_PTR]]
+; CHECK-NEXT:    br i1 [[CMP_LVER_ORIG]], label %[[WHILE_BODY_LVER_ORIG]], label %[[WHILE_END_LOOPEXIT_LOOPEXIT:.*]]
+; CHECK:       [[WHILE_BODY_PH]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[TMP5]], i1 false)
+; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
+; CHECK:       [[WHILE_BODY]]:
+; CHECK-NEXT:    [[DST_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], %[[WHILE_BODY]] ], [ [[DST]], %[[WHILE_BODY_PH]] ]
+; CHECK-NEXT:    [[SRC_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[SRC]], %[[WHILE_BODY_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[SRC_ADDR_05]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[SRC_ADDR_05]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_06]], i64 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR1]], [[ADD_PTR]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[WHILE_BODY]], label %[[WHILE_END_LOOPEXIT_LOOPEXIT3:.*]]
+; CHECK:       [[WHILE_END_LOOPEXIT_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END_LOOPEXIT:.*]]
+; CHECK:       [[WHILE_END_LOOPEXIT_LOOPEXIT3]]:
+; CHECK-NEXT:    br label %[[WHILE_END_LOOPEXIT]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add.ptr = getelementptr inbounds nuw i8, ptr %dst, i64 %d_len
+  %cmp4.not = icmp eq i64 %d_len, 0
+  br i1 %cmp4.not, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %dst.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %dst, %while.body.preheader ]
+  %src.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %src, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %src.addr.05, i64 1
+  %0 = load i8, ptr %src.addr.05, align 1
+  %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %dst.addr.06, i64 1
+  store i8 %0, ptr %dst.addr.06, align 1
+  %cmp = icmp ult ptr %incdec.ptr1, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}
+
+; If the loop has multiple loads or stores, we don't perform loop versioning to
+; prevent code size from growing exponentially.
+define dso_local void @multiple_load_store_pairs(ptr %dst0, ptr %src0, i32 noundef %n, ptr %dst1, ptr %src1) {
+; CHECK-LABEL: define dso_local void @multiple_load_store_pairs(
+; CHECK-SAME: ptr [[DST0:%.*]], ptr [[SRC0:%.*]], i32 noundef [[N:%.*]], ptr [[DST1:%.*]], ptr [[SRC1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP_NOT12:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT12]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_013:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_013]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC0]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw double, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw double, ptr [[DST1]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[INC]] = add i32 [[I_013]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp.not12 = icmp eq i32 %n, 0
+  br i1 %cmp.not12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.013 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.013 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr %src0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %dst0, i64 %idxprom
+  store i8 %0, ptr %arrayidx2, align 1
+  %arrayidx4 = getelementptr inbounds nuw double, ptr %src1, i64 %idxprom
+  %1 = load double, ptr %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds nuw double, ptr %dst1, i64 %idxprom
+  %2 = load double, ptr %arrayidx6, align 8
+  %add = fadd double %1, %2
+  store double %add, ptr %arrayidx6, align 8
+  %inc = add i32 %i.013, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; If there are instructions other than the store, the load, or loop control
+; instructions, we don't perform loop versioning to prevent code size from
+; growing.
+define dso_local double @other_instrs_exist(ptr %dst, ptr %src, i32 noundef %n) {
+; CHECK-LABEL: define dso_local double @other_instrs_exist(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP_NOT8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT8]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret double [[RES_0_LCSSA]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[RES_09:%.*]] = phi double [ [[ADD]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_010]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[I_010]] to double
+; CHECK-NEXT:    [[ADD]] = fadd double [[RES_09]], [[CONV]]
+; CHECK-NEXT:    [[INC]] = add i32 [[I_010]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp.not8 = icmp eq i32 %n, 0
+  br i1 %cmp.not8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %res.0.lcssa
+
+for.body:
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %res.09 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %idxprom = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr %src, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %dst, i64 %idxprom
+  store i8 %0, ptr %arrayidx2, align 1
+  %conv = uitofp i32 %i.010 to double
+  %add = fadd double %res.09, %conv
+  %inc = add i32 %i.010, 1
+  %cmp.not = icmp eq i32 %inc, %n
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopIdiom/pr82337.ll b/llvm/test/Transforms/LoopIdiom/pr82337.ll
index da9eb14af3f0a4..a3df6cf5ae53b2 100644
--- a/llvm/test/Transforms/LoopIdiom/pr82337.ll
+++ b/llvm/test/Transforms/LoopIdiom/pr82337.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes=loop-idiom < %s | FileCheck %s
+; RUN: opt -S -passes=loop-idiom -enable-loop-idiom-version=0 < %s | FileCheck %s
 
 ; The poison flags should be preserved, as no transform takes place.
 define void @test(ptr %p.end, ptr %p.start) {