[llvm] [LoopInterchange] Support inner-loop simple reductions via UndoSimpleReduction (PR #172970)

Thu Dec 25 02:07:08 PST 2025

https://github.com/buggfg updated https://github.com/llvm/llvm-project/pull/172970

>From 9e354c81a94d2dcaf6c1603dcf31ef0fd453df79 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Fri, 19 Dec 2025 16:11:26 +0800
Subject: [PATCH 01/10] Support inner-loop simple reductions via
 UndoSimpleReduction

Co-Authored-By: ict-ql <168183727+ict-ql at users.noreply.github.com>
Co-Authored-By: Lin Wang <wanglulin at ict.ac.cn>
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 411 ++++++++++++++----
 .../LoopInterchange/simple-reduction.ll       |  86 ++++
 2 files changed, 414 insertions(+), 83 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/simple-reduction.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 330b4abb9942f..3da23c7f9ae11 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -120,6 +121,12 @@ static cl::list<RuleTy> Profitabilities(
                           "Ignore profitability, force interchange (does not "
                           "work with other options)")));
 
+// Support for simple reduction of inner loop.
+static cl::opt<bool>
+    EnableUndoSimpleReduction("undo-simple-reduction", cl::init(false),
+                              cl::Hidden,
+                              cl::desc("Support for simple reduction of inner loop."));
+
 #ifndef NDEBUG
 static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
   SmallSet<RuleTy, 4> Set;
@@ -446,8 +453,8 @@ namespace {
 class LoopInterchangeLegality {
 public:
   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                          OptimizationRemarkEmitter *ORE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+                          OptimizationRemarkEmitter *ORE, DominatorTree *DT)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
 
   /// Check if the loops can be interchanged.
   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -475,9 +482,30 @@ class LoopInterchangeLegality {
     return HasNoWrapReductions;
   }
 
+  /// Describes a simple reduction found in the inner loop.
+  struct SimpleReduction {
+    /// The reduction PHI itself.
+    PHINode *Re;
+    /// Incoming values of Re from the preheader (a constant) and the latch.
+    Value *Init;
+    Value *Next;
+    /// The LCSSA PHI of Next.
+    PHINode *LcssaPhi;
+    /// The single user of LcssaPhi; it stores the reduction result to memory.
+    StoreInst *LcssaStorer;
+    /// The pointer LcssaStorer stores to, and the type of the stored value.
+    Value *MemRef;
+    Type *ElemTy;
+  };
+
+  /// Return the simple reductions recorded for the inner loop.
+  const ArrayRef<SimpleReduction *> getInnerSimpleReductions() const {
+    return InnerSimpleReductions;
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
-  bool containsUnsafeInstructions(BasicBlock *BB);
+  bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
 
   /// Discover induction and reduction PHIs in the header of \p L. Induction
   /// PHIs are added to \p Inductions, reductions are added to
@@ -487,11 +515,16 @@ class LoopInterchangeLegality {
                                   SmallVector<PHINode *, 8> &Inductions,
                                   Loop *InnerLoop);
 
+  /// Detect simple-reduction PHIs in the inner loop. Add them to
+  /// InnerSimpleReductions.
+  bool findSimpleReduction(Loop *L, PHINode *Phi,
+                           SmallVectorImpl<Instruction *> &HasNoWrapInsts);
+
   Loop *OuterLoop;
   Loop *InnerLoop;
 
   ScalarEvolution *SE;
-
+  DominatorTree *DT;
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
@@ -506,6 +539,9 @@ class LoopInterchangeLegality {
   /// like integer addition/multiplication. Those flags must be dropped when
   /// interchanging the loops.
   SmallVector<Instruction *, 4> HasNoWrapReductions;
+
+  /// Vector of simple reductions of inner loop.
+  SmallVector<SimpleReduction *, 8> InnerSimpleReductions;
 };
 
 /// Manages information utilized by the profitability check for cache. The main
@@ -575,6 +611,7 @@ class LoopInterchangeTransform {
 
   /// Interchange OuterLoop and InnerLoop.
   bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
+  void undoSimpleReduction();
   void restructureLoops(Loop *NewInner, Loop *NewOuter,
                         BasicBlock *OrigInnerPreHeader,
                         BasicBlock *OrigOuterPreHeader);
@@ -693,7 +730,7 @@ struct LoopInterchange {
     Loop *InnerLoop = LoopList[InnerLoopId];
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
-    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
+    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
       return false;
@@ -734,8 +771,11 @@ struct LoopInterchange {
 
 } // end anonymous namespace
 
-bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
-  return any_of(*BB, [](const Instruction &I) {
-    return I.mayHaveSideEffects() || I.mayReadFromMemory();
+/// Return true if \p BB contains an instruction, other than \p Skip, that may
+/// have side effects or may read from memory.
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
+                                                         Instruction *Skip) {
+  return any_of(*BB, [Skip](const Instruction &I) {
+    return &I != Skip && (I.mayHaveSideEffects() || I.mayReadFromMemory());
   });
 }
@@ -761,17 +801,27 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
       return false;
 
   LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+
+  // The inner loop simple-reduction pattern requires storing the LCSSA PHI in
+  // the OuterLoop Latch. Therefore, when UndoSimpleReduction is enabled, skip
+  // that store during checks.
+  Instruction *Skip = nullptr;
+  if (EnableUndoSimpleReduction) {
+    if (InnerSimpleReductions.size() == 1)
+      Skip = InnerSimpleReductions[0]->LcssaStorer;
+  }
+
   // We do not have any basic block in between now make sure the outer header
   // and outer loop latch doesn't contain any unsafe instructions.
-  if (containsUnsafeInstructions(OuterLoopHeader) ||
-      containsUnsafeInstructions(OuterLoopLatch))
+  if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
+      containsUnsafeInstructions(OuterLoopLatch, Skip))
     return false;
 
   // Also make sure the inner loop preheader does not contain any unsafe
   // instructions. Note that all instructions in the preheader will be moved to
   // the outer loop header when interchanging.
   if (InnerLoopPreHeader != OuterLoopHeader &&
-      containsUnsafeInstructions(InnerLoopPreHeader))
+      containsUnsafeInstructions(InnerLoopPreHeader, Skip))
     return false;
 
   BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
@@ -787,7 +837,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   // The inner loop exit block does flow to the outer loop latch and not some
   // other BBs, now make sure it contains safe instructions, since it will be
   // moved into the (new) inner loop after interchange.
-  if (containsUnsafeInstructions(InnerLoopExit))
+  if (containsUnsafeInstructions(InnerLoopExit, Skip))
     return false;
 
   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
@@ -898,6 +948,77 @@ static Value *followLCSSA(Value *SV) {
   return followLCSSA(PHI->getIncomingValue(0));
 }
 
+/// Return true if \p PHI is a reduction PHI of \p L with a supported recurrence
+/// kind. Integer add/mul reduction instructions that carry nuw/nsw flags are
+/// appended to \p HasNoWrapInsts.
+static bool CheckReductionKind(Loop *L, PHINode *PHI,
+                               SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+  RecurrenceDescriptor RD;
+  // Only reduction PHIs qualify, and floating point reductions only when they
+  // can be reordered.
+  if (!RecurrenceDescriptor::isReductionPHI(PHI, L, RD) ||
+      RD.getExactFPMathInst() != nullptr)
+    return false;
+
+  RecurKind RK = RD.getRecurrenceKind();
+  switch (RK) {
+  case RecurKind::Or:
+  case RecurKind::And:
+  case RecurKind::Xor:
+  case RecurKind::SMin:
+  case RecurKind::SMax:
+  case RecurKind::UMin:
+  case RecurKind::UMax:
+  case RecurKind::FAdd:
+  case RecurKind::FMul:
+  case RecurKind::FMin:
+  case RecurKind::FMax:
+  case RecurKind::FMinimum:
+  case RecurKind::FMaximum:
+  case RecurKind::FMinimumNum:
+  case RecurKind::FMaximumNum:
+  case RecurKind::FMulAdd:
+  case RecurKind::AnyOf:
+    return true;
+
+  // Changing the order of integer addition/multiplication may change the
+  // semantics. Consider the following case:
+  //
+  //  int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
+  //  int sum = 0;
+  //  for (int i = 0; i < 2; i++)
+  //    for (int j = 0; j < 2; j++)
+  //      sum += A[j][i];
+  //
+  // If the above loops are exchanged, the addition will cause an overflow. To
+  // prevent this, we must drop the nuw/nsw flags from the
+  // addition/multiplication instructions when we actually exchange the loops.
+  case RecurKind::Add:
+  case RecurKind::Mul: {
+    unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
+    SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
+
+    // Bail out when we fail to collect reduction instructions chain.
+    if (Ops.empty())
+      return false;
+
+    for (Instruction *I : Ops) {
+      assert(I->getOpcode() == OpCode &&
+             "Expected the instruction to be the reduction operation");
+      (void)OpCode;
+
+      // If the instruction has nuw/nsw flags, we must drop them when the
+      // transformation is actually performed.
+      if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
+        HasNoWrapInsts.push_back(I);
+    }
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
 // Check V's users to see if it is involved in a reduction in L.
 static PHINode *
 findInnerReductionPhi(Loop *L, Value *V,
@@ -910,72 +1031,12 @@ findInnerReductionPhi(Loop *L, Value *V,
     if (PHINode *PHI = dyn_cast<PHINode>(User)) {
       if (PHI->getNumIncomingValues() == 1)
         continue;
-      RecurrenceDescriptor RD;
-      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
-        // Detect floating point reduction only when it can be reordered.
-        if (RD.getExactFPMathInst() != nullptr)
-          return nullptr;
-
-        RecurKind RK = RD.getRecurrenceKind();
-        switch (RK) {
-        case RecurKind::Or:
-        case RecurKind::And:
-        case RecurKind::Xor:
-        case RecurKind::SMin:
-        case RecurKind::SMax:
-        case RecurKind::UMin:
-        case RecurKind::UMax:
-        case RecurKind::FAdd:
-        case RecurKind::FMul:
-        case RecurKind::FMin:
-        case RecurKind::FMax:
-        case RecurKind::FMinimum:
-        case RecurKind::FMaximum:
-        case RecurKind::FMinimumNum:
-        case RecurKind::FMaximumNum:
-        case RecurKind::FMulAdd:
-        case RecurKind::AnyOf:
-          return PHI;
-
-        // Change the order of integer addition/multiplication may change the
-        // semantics. Consider the following case:
-        //
-        //  int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
-        //  int sum = 0;
-        //  for (int i = 0; i < 2; i++)
-        //    for (int j = 0; j < 2; j++)
-        //      sum += A[j][i];
-        //
-        // If the above loops are exchanged, the addition will cause an
-        // overflow. To prevent this, we must drop the nuw/nsw flags from the
-        // addition/multiplication instructions when we actually exchanges the
-        // loops.
-        case RecurKind::Add:
-        case RecurKind::Mul: {
-          unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
-          SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
-
-          // Bail out when we fail to collect reduction instructions chain.
-          if (Ops.empty())
-            return nullptr;
-
-          for (Instruction *I : Ops) {
-            assert(I->getOpcode() == OpCode &&
-                   "Expected the instruction to be the reduction operation");
-            (void)OpCode;
-
-            // If the instruction has nuw/nsw flags, we must drop them when the
-            // transformation is actually performed.
-            if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
-              HasNoWrapInsts.push_back(I);
-          }
-          return PHI;
-        }
 
-        default:
-          return nullptr;
-        }
-      }
+      if (CheckReductionKind(L, PHI, HasNoWrapInsts))
+        return PHI;
+      else
+        return nullptr;
+
       return nullptr;
     }
   }
@@ -983,6 +1044,116 @@ findInnerReductionPhi(Loop *L, Value *V,
   return nullptr;
 }
 
+// Detect and record the simple reduction of the inner loop.
+//
+//    innerloop:
+//        Re = phi<0.0, Next>
+//        ReUser = Re op ...
+//        ...
+//        Next = ReUser op ...
+//    OuterLoopLatch:
+//        Lcssa = phi<Next>    ; lcssa phi
+//        store Lcssa, MemRef  ; LcssaStorer
+//
+bool LoopInterchangeLegality::findSimpleReduction(
+    Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+
+  // Only support undoing a simple reduction when L is the innermost loop of
+  // the two loops being interchanged.
+  if (!L->isInnermost())
+    return false;
+
+  // getIncomingValueForBlock() asserts when given a block that is not an
+  // incoming block, so require the loop shape relied upon below up front.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *Exiting = L->getExitingBlock();
+  if (!Preheader || !Latch || !Exiting)
+    return false;
+
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  Value *Init = Phi->getIncomingValueForBlock(Preheader);
+  Value *Next = Phi->getIncomingValueForBlock(Latch);
+
+  // So far only a constant initial value is supported.
+  if (!isa<Constant>(Init))
+    return false;
+
+  // The reduction result must be computed by an instruction inside the loop;
+  // undoSimpleReduction() moves the store right after that instruction.
+  auto *NextI = dyn_cast<Instruction>(Next);
+  if (!NextI || !L->contains(NextI->getParent()))
+    return false;
+
+  // The reduction PHI must have exactly one user, located inside the loop.
+  // getUniqueUndroppableUser() returns null if that user is droppable.
+  if (!Phi->hasOneUser())
+    return false;
+  auto *ReUser =
+      dyn_cast_if_present<Instruction>(Phi->getUniqueUndroppableUser());
+  if (!ReUser || !L->contains(ReUser->getParent()))
+    return false;
+
+  // The reduction operation must be an associative binary operator. This
+  // also rejects sub/fsub, which are never associative.
+  if (!ReUser->isBinaryOp() || !ReUser->isAssociative())
+    return false;
+
+  // Check the reduction kind. A single-operation reduction (ReUser == Next) is
+  // accepted based on the checks above alone.
+  if (ReUser != Next && !CheckReductionKind(L, Phi, HasNoWrapInsts))
+    return false;
+
+  // Find the block the inner loop exits to; it holds the LCSSA PHI.
+  auto *BI = dyn_cast<BranchInst>(Exiting->getTerminator());
+  if (!BI)
+    return false;
+  BasicBlock *ExitBlock =
+      BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+
+  // Besides the reduction PHI, Next may only be used by PHIs. The LCSSA PHI is
+  // the one in ExitBlock that receives Next from the exiting block.
+  PHINode *Lcssa = nullptr;
+  for (User *U : Next->users()) {
+    auto *P = dyn_cast<PHINode>(U);
+    if (!P)
+      return false;
+    if (P == Phi || Lcssa || P->getParent() != ExitBlock)
+      continue;
+    if (P->getIncomingValueForBlock(Exiting) == Next)
+      Lcssa = P;
+  }
+  if (!Lcssa || !Lcssa->hasOneUser())
+    return false;
+
+  // The LCSSA PHI must feed exactly one simple store. A volatile or atomic
+  // store must not be moved into the loop, where it would run every iteration.
+  auto *LcssaStorer =
+      dyn_cast_if_present<StoreInst>(Lcssa->getUniqueUndroppableUser());
+  if (!LcssaStorer || !LcssaStorer->isSimple())
+    return false;
+
+  Value *MemRef = LcssaStorer->getPointerOperand();
+  Type *ElemTy = LcssaStorer->getValueOperand()->getType();
+
+  // undoSimpleReduction() loads from MemRef in the inner loop header and moves
+  // LcssaStorer into the loop, so an instruction defining MemRef must dominate
+  // the header. Arguments, globals and constants always dominate.
+  if (auto *MemRefI = dyn_cast<Instruction>(MemRef))
+    if (!DT->dominates(MemRefI, L->getHeader()))
+      return false;
+
+  // Found a simple reduction of the inner loop.
+  // FIXME: The record is allocated with new and never freed;
+  // InnerSimpleReductions should own SimpleReduction objects by value.
+  SimpleReduction *SR =
+      new SimpleReduction{Phi, Init, Next, Lcssa, LcssaStorer, MemRef, ElemTy};
+  InnerSimpleReductions.push_back(SR);
+  return true;
+}
+
 bool LoopInterchangeLegality::findInductionAndReductions(
     Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
   if (!L->getLoopLatch() || !L->getLoopPredecessor())
@@ -995,11 +1166,14 @@ bool LoopInterchangeLegality::findInductionAndReductions(
       // PHIs in inner loops need to be part of a reduction in the outer loop,
       // discovered when checking the PHIs of the outer loop earlier.
       if (!InnerLoop) {
-        if (!OuterInnerReductions.count(&PHI)) {
-          LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
-                               "across the outer loop.\n");
+        if (OuterInnerReductions.count(&PHI)) {
+          LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
+        } else if (EnableUndoSimpleReduction &&
+                   findSimpleReduction(L, &PHI, HasNoWrapReductions)) {
+          LLVM_DEBUG(dbgs() << "Found a simple reduction in the inner loop: \n"
+                            << PHI << '\n');
+        } else
           return false;
-        }
       } else {
         assert(PHI.getNumIncomingValues() == 2 &&
                "Phis in loop header should have exactly 2 incoming values");
@@ -1020,6 +1194,10 @@ bool LoopInterchangeLegality::findInductionAndReductions(
       }
     }
   }
+
+  // For now we only support at most one reduction.
+  if (InnerSimpleReductions.size() > 1)
+    return false;
   return true;
 }
 
@@ -1115,12 +1293,15 @@ bool LoopInterchangeLegality::findInductions(
 // the we are only interested in the final value after the loop).
 static bool
 areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
-                              SmallPtrSetImpl<PHINode *> &Reductions) {
+                              SmallPtrSetImpl<PHINode *> &Reductions,
+                              PHINode *LcssaSimpleRed) {
   BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
   for (PHINode &PHI : InnerExit->phis()) {
     // Reduction lcssa phi will have only 1 incoming block that from loop latch.
     if (PHI.getNumIncomingValues() > 1)
       return false;
+    if (&PHI == LcssaSimpleRed)
+      return true;
     if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
           PHINode *PN = dyn_cast<PHINode>(U);
           return !PN ||
@@ -1270,8 +1451,16 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
     return false;
   }
 
-  if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
-                                     OuterInnerReductions)) {
+  // The LCSSA PHI for the simple reduction has passed checks before; its user
+  // is a store instruction.
+  PHINode *LcssaSimpleRed = nullptr;
+  if (EnableUndoSimpleReduction) {
+    if (InnerSimpleReductions.size() == 1)
+      LcssaSimpleRed = InnerSimpleReductions[0]->LcssaPhi;
+  }
+
+  if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
+                                     LcssaSimpleRed)) {
     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
@@ -1633,10 +1822,66 @@ void LoopInterchangeTransform::restructureLoops(
   SE->forgetLoop(NewOuter);
 }
 
+/*
+  User can write, optimizers can generate simple reduction for inner loop. In
+  order to make interchange valid, we have to undo reduction by moving th
+  initialization and store instructions into the inner loop. So far we only
+  handle cases where the reduction variable is initialized to a constant.
+  For example, below code:
+
+  loop:
+    re = phi<0.0, next>
+    next = re op ...
+  reduc_sum = phi<next>       // lcssa phi
+  MEM_REF[idx] = reduc_sum		// LcssaStorer
+
+  is transformed into:
+
+  loop:
+    tmp = MEM_REF[idx];
+    new_var = !first_iteration ? tmp : 0.0;
+    next = new_var op ...
+    MEM_REF[idx] = next;		// after moving
+
+  In this way the initial const is used in the first iteration of loop.
+*/
+void LoopInterchangeTransform::undoSimpleReduction() {
+
+  auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+  LoopInterchangeLegality::SimpleReduction *SR = InnerSimpleReductions[0];
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
+
+  // The reduction starts from a constant, so load the running value from the
+  // memory object. The load reuses the alignment of the store being undone.
+  Instruction *LoadMem = Builder.CreateAlignedLoad(
+      SR->ElemTy, SR->MemRef, SR->LcssaStorer->getAlign());
+
+  // This compare is true on every iteration except the first one.
+  auto &InductionPHIs = LIL.getInnerLoopInductions();
+  PHINode *IV = InductionPHIs[0];
+  Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
+  Value *NotFirst = Builder.CreateICmpNE(IV, IVInit, "first.iter");
+
+  // Use the loaded value, except on the first iteration (constant init).
+  Value *NewVar = Builder.CreateSelect(NotFirst, LoadMem, SR->Init, "new.var");
+
+  // Replace all uses of reduction var with new variable.
+  SR->Re->replaceAllUsesWith(NewVar);
+
+  // Move store instruction into inner loop, just after reduction next's def.
+  SR->LcssaStorer->setOperand(0, SR->Next);
+  SR->LcssaStorer->moveAfter(cast<Instruction>(SR->Next));
+}
+
 bool LoopInterchangeTransform::transform(
     ArrayRef<Instruction *> DropNoWrapInsts) {
   bool Transformed = false;
 
+  auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+  if (EnableUndoSimpleReduction && InnerSimpleReductions.size() == 1)
+    undoSimpleReduction();
+
   if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
new file mode 100644
index 0000000000000..9a4393f827a36
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -0,0 +1,86 @@
+; NOTE: Support simple reduction in the inner loop by undoing the simple reduction.
+; RUN: opt < %s -passes="loop(loop-interchange),dce"  -undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
+
+; for (int i = 0; i < n; i++) {
+;   s[i] = 0;
+;   for (int j = 0; j < n; j++)
+;     s[i] = s[i] + a[j][i] * b[j][i];
+; }
+
+define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+; CHECK-LABEL: define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[INNERLOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       outerloop_header.preheader:
+; CHECK-NEXT:    br label [[OUTERLOOP_HEADER:%.*]]
+; CHECK:       outerloop_header:
+; CHECK-NEXT:    [[INDEX_I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[OUTERLOOP_LATCH:%.*]] ], [ 0, [[OUTERLOOPHEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[ADDR_S:%.*]] = getelementptr inbounds nuw double, ptr %s, i64 [[INDEX_I]]
+; CHECK-NEXT:    [[ADDR_A:%.*]] = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT:    [[ADDR_B:%.*]] = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT:    br label [[INNERLOOP_SPLIT1:%.*]]
+; CHECK:       innerloop.preheader:
+; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
+; CHECK:       innerloop:
+; CHECK-NEXT:    [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT:    br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
+; CHECK:       innerloop.split1:
+; CHECK-NEXT:    [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
+; CHECK-NEXT:    [[FIRSTITER:%.*]] = icmp ne i64 [[INDEX_J]], 0
+; CHECK-NEXT:    [[NEW_VAR:%.*]] = select i1 [[FIRSTITER]], double [[S]], double 0.000000e+00
+; CHECK-NEXT:    [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_A]], i64 [[INDEX_J]]
+; CHECK-NEXT:    [[A_J_I:%.*]] = load double, ptr [[ADDR_A_J_I]], align 8
+; CHECK-NEXT:    [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_B]], i64 [[INDEX_J]]
+; CHECK-NEXT:    [[B_J_I:%.*]] = load double, ptr [[ADDR_B_J_I]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[ADDR_S]], align 8
+; CHECK-NEXT:    br label [[OUTERLOOP_LATCH:%.*]]
+; CHECK:       innerloop.split:
+; CHECK-NEXT:    [[J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 [[J_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[EXIT_LOOPEXIT:%.*]], label [[INNERLOOP]]
+; CHECK:       outerloop_latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[INDEX_I]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[INNERLOOP_SPLIT:%.*]], label [[OUTERLOOP_HEADER]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:                                 ; &s[i] and the a/b addresses for column i
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  br label %innerloop
+
+innerloop:                                        ; %reduction accumulates a[j][i] * b[j][i]
+  %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+  %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:                                  ; inner loop exit: LCSSA phi stored to s[i]
+  %lcssa = phi double [ %add, %innerloop ]
+  store double %lcssa, ptr %addr_s, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+  ret void
+}

>From 404657762e6c223894e5a4635302cba99514ff83 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Fri, 19 Dec 2025 16:29:45 +0800
Subject: [PATCH 02/10] Correct the format.

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 3da23c7f9ae11..329cb2189827e 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -122,10 +122,9 @@ static cl::list<RuleTy> Profitabilities(
                           "work with other options)")));
 
 // Support for simple reduction of inner loop.
-static cl::opt<bool>
-    EnableUndoSimpleReduction("undo-simple-reduction", cl::init(false),
-                              cl::Hidden,
-                              cl::desc("Support for simple reduction of inner loop."));
+static cl::opt<bool> EnableUndoSimpleReduction(
+    "undo-simple-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Support for simple reduction of inner loop."));
 
 #ifndef NDEBUG
 static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {

>From cbeeb15da98d7a544a71450fc495191bf2813baf Mon Sep 17 00:00:00 2001
From: Yingying Wang <3171290993 at qq.com>
Date: Mon, 22 Dec 2025 18:07:58 +0800
Subject: [PATCH 03/10] Correct the format of the comments.

Co-authored-by: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 329cb2189827e..19da518caed2a 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1043,17 +1043,17 @@ findInnerReductionPhi(Loop *L, Value *V,
   return nullptr;
 }
 
-// Detect and record the simple reduction of the inner loop.
-//
-//    innerloop:
-//        Re = phi<0.0, Next>
-//        ReUser = Re op ...
-//        ...
-//        Next = ReUser op ...
-//    OuterLoopLatch:
-//        Lcssa = phi<Next>    ; lcssa phi
-//        store Lcssa, MemRef  ; LcssaStorer
-//
+/// Detect and record the simple reduction of the inner loop.
+///
+///    innerloop:
+///        Re = phi<0.0, Next>
+///        ReUser = Re op ...
+///        ...
+///        Next = ReUser op ...
+///    OuterLoopLatch:
+///        Lcssa = phi<Next>    ; lcssa phi
+///        store Lcssa, MemRef  ; LcssaStorer
+///
 bool LoopInterchangeLegality::findSimpleReduction(
     Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
 

>From 66f55c6d1dff845adbc822eff6d230b6120c6e7f Mon Sep 17 00:00:00 2001
From: Yingying Wang <3171290993 at qq.com>
Date: Mon, 22 Dec 2025 18:08:31 +0800
Subject: [PATCH 04/10] Correct the format of the comments.

Co-authored-by: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 19da518caed2a..31778ea028f00 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1821,29 +1821,27 @@ void LoopInterchangeTransform::restructureLoops(
   SE->forgetLoop(NewOuter);
 }
 
-/*
-  User can write, optimizers can generate simple reduction for inner loop. In
-  order to make interchange valid, we have to undo reduction by moving th
-  initialization and store instructions into the inner loop. So far we only
-  handle cases where the reduction variable is initialized to a constant.
-  For example, below code:
-
-  loop:
-    re = phi<0.0, next>
-    next = re op ...
-  reduc_sum = phi<next>       // lcssa phi
-  MEM_REF[idx] = reduc_sum		// LcssaStorer
-
-  is transformed into:
-
-  loop:
-    tmp = MEM_REF[idx];
-    new_var = !first_iteration ? tmp : 0.0;
-    next = new_var op ...
-    MEM_REF[idx] = next;		// after moving
-
-  In this way the initial const is used in the first iteration of loop.
-*/
+///  User can write, optimizers can generate simple reduction for inner loop. In
+///  order to make interchange valid, we have to undo reduction by moving the
+///  initialization and store instructions into the inner loop. So far we only
+///  handle cases where the reduction variable is initialized to a constant.
+///  For example, below code:
+///
+///  loop:
+///    re = phi<0.0, next>
+///    next = re op ...
+///  reduc_sum = phi<next>       // lcssa phi
+///  MEM_REF[idx] = reduc_sum		// LcssaStorer
+///
+///  is transformed into:
+///
+///  loop:
+///    tmp = MEM_REF[idx];
+///    new_var = !first_iteration ? tmp : 0.0;
+///    next = new_var op ...
+///    MEM_REF[idx] = next;		// after moving
+///
+///  In this way the initial const is used in the first iteration of loop.
 void LoopInterchangeTransform::undoSimpleReduction() {
 
   auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();

>From e234c58b151c7161ce4e85c5a8a2a7be21269792 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 17:45:38 +0800
Subject: [PATCH 05/10] Add four negative tests and improve security

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 142 ++++++++---
 .../simple-reduction-limitation.ll            | 240 ++++++++++++++++++
 .../LoopInterchange/simple-reduction.ll       |   8 +-
 3 files changed, 351 insertions(+), 39 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 31778ea028f00..7cd6744c34528 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -123,7 +123,7 @@ static cl::list<RuleTy> Profitabilities(
 
 // Support for simple reduction of inner loop.
 static cl::opt<bool> EnableUndoSimpleReduction(
-    "undo-simple-reduction", cl::init(false), cl::Hidden,
+    "loop-interchange-undo-simple-reduction", cl::init(false), cl::Hidden,
     cl::desc("Support for simple reduction of inner loop."));
 
 #ifndef NDEBUG
@@ -496,9 +496,12 @@ class LoopInterchangeLegality {
     // The memory Location
     Value *MemRef;
     Type *ElemTy;
+
+    /// IV used for the loop exit condition.
+    PHINode *CounterIV;
   };
 
-  const ArrayRef<SimpleReduction *> getInnerSimpleReductions() const {
+  const ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
     return InnerSimpleReductions;
   }
 
@@ -540,7 +543,7 @@ class LoopInterchangeLegality {
   SmallVector<Instruction *, 4> HasNoWrapReductions;
 
   /// Vector of simple reductions of inner loop.
-  SmallVector<SimpleReduction *, 8> InnerSimpleReductions;
+  SmallVector<SimpleReduction, 8> InnerSimpleReductions;
 };
 
 /// Manages information utilized by the profitability check for cache. The main
@@ -806,8 +809,10 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   // that store during checks.
   Instruction *Skip = nullptr;
   if (EnableUndoSimpleReduction) {
+    assert(InnerSimpleReductions.size() <= 1 &&
+           "So far we only support at most one reduction.");
     if (InnerSimpleReductions.size() == 1)
-      Skip = InnerSimpleReductions[0]->LcssaStorer;
+      Skip = InnerSimpleReductions[0].LcssaStorer;
   }
 
   // We do not have any basic block in between now make sure the outer header
@@ -947,7 +952,7 @@ static Value *followLCSSA(Value *SV) {
   return followLCSSA(PHI->getIncomingValue(0));
 }
 
-bool CheckReductionKind(Loop *L, PHINode *PHI,
+static bool CheckReductionKind(Loop *L, PHINode *PHI,
                         SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
   RecurrenceDescriptor RD;
   if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
@@ -1035,14 +1040,71 @@ findInnerReductionPhi(Loop *L, Value *V,
         return PHI;
       else
         return nullptr;
+    }
+  }
 
       return nullptr;
     }
+
+static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
+  Instruction *IncI = dyn_cast<Instruction>(IncV);
+  if (!IncI)
+    return nullptr;
+
+  if (IncI->getOpcode() != Instruction::Add &&
+      IncI->getOpcode() != Instruction::Sub)
+    return nullptr;
+
+  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    return Phi;
+  }
+
+  // Allow add/sub to be commuted.
+  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    return Phi;
   }
 
   return nullptr;
 }
 
+/// UndoSimpleReduction requires the first_iteration check, so look for
+/// the IV used for the loop exit condition
+static PHINode *findCounterIV(Loop *L) {
+
+  assert(L->getLoopLatch() && "Must be in simplified form");
+
+  BranchInst *BI = cast<BranchInst>(L->getLoopLatch()->getTerminator());
+  if (L->isLoopInvariant(BI->getCondition()))
+    return nullptr;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond)
+    return nullptr;
+
+  // Look for a loop invariant RHS
+  Value *LHS = Cond->getOperand(0);
+  Value *RHS = Cond->getOperand(1);
+  if (!L->isLoopInvariant(RHS)) {
+    if (!L->isLoopInvariant(LHS))
+      return nullptr;
+    std::swap(LHS, RHS);
+  }
+
+  // IndVar = phi[{InitialValue, preheader}, {StepInst, latch}]
+  // StepInst = IndVar + step
+  // case 1:
+  // cmp = IndVar < FinalValue
+  PHINode *Counter = dyn_cast<PHINode>(LHS);
+  // case 2:
+  // cmp = StepInst < FinalValue
+  if (!Counter)
+    Counter = getCounterFromInc(LHS, L);
+
+  return Counter;
+}
+
 /// Detect and record the simple reduction of the inner loop.
 ///
 ///    innerloop:
@@ -1069,8 +1131,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
   Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
 
   // So far only supports constant initial value.
-  auto *ConstInit = dyn_cast<Constant>(Init);
-  if (!ConstInit)
+  if (!isa<Constant>(Init))
     return false;
 
   // The reduction result must live in the inner loop.
@@ -1088,11 +1149,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
     return false;
 
   // Check the reduction operation.
-  if (!ReUser->isAssociative() || !ReUser->isBinaryOp() ||
-      (ReUser->getOpcode() == Instruction::Sub &&
-       ReUser->getOperand(0) == Phi) ||
-      (ReUser->getOpcode() == Instruction::FSub &&
-       ReUser->getOperand(0) == Phi))
+  if (!ReUser->isAssociative())
     return false;
 
   // Check the reduction kind.
@@ -1100,13 +1157,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
     return false;
 
   // Find lcssa_phi in OuterLoop's Latch
-  if (!L->getExitingBlock())
-    return false;
-  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
-  if (!BI)
-    return false;
-  BasicBlock *ExitBlock =
-      BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+  BasicBlock *ExitBlock = L->getExitBlock();;
   if (!ExitBlock)
     return false;
 
@@ -1119,6 +1170,8 @@ bool LoopInterchangeLegality::findSimpleReduction(
       if (Lcssa == NULL && P->getParent() == ExitBlock &&
           P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
         Lcssa = P;
+      else
+        return false;
     } else
       return false;
   }
@@ -1139,17 +1192,23 @@ bool LoopInterchangeLegality::findSimpleReduction(
   if (!DT->dominates(dyn_cast<Instruction>(MemRef), ExitBlock))
     return false;
 
+  // find the IV used for the loop exit condition.
+  PHINode *CounterIV = findCounterIV(L);
+  if (!CounterIV)
+    return false;
+
   // Found a simple reduction of inner loop.
-  SimpleReduction *SR = new SimpleReduction;
-  SR->Re = Phi;
-  SR->Init = Init;
-  SR->Next = Next;
-  SR->LcssaPhi = Lcssa;
-  SR->LcssaStorer = LcssaStorer;
-  SR->MemRef = MemRef;
-  SR->ElemTy = ElemTy;
-
-  InnerSimpleReductions.push_back(&*SR);
+  SimpleReduction SR;
+  SR.Re = Phi;
+  SR.Init = Init;
+  SR.Next = Next;
+  SR.LcssaPhi = Lcssa;
+  SR.LcssaStorer = LcssaStorer;
+  SR.MemRef = MemRef;
+  SR.ElemTy = ElemTy;
+  SR.CounterIV = CounterIV;
+
+  InnerSimpleReductions.push_back(SR);
   return true;
 }
 
@@ -1454,8 +1513,10 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
   // is a store instruction.
   PHINode *LcssaSimpleRed = nullptr;
   if (EnableUndoSimpleReduction) {
+    assert(InnerSimpleReductions.size() <= 1 &&
+           "So far we only support at most one reduction.");
     if (InnerSimpleReductions.size() == 1)
-      LcssaSimpleRed = InnerSimpleReductions[0]->LcssaPhi;
+      LcssaSimpleRed = InnerSimpleReductions[0].LcssaPhi;
   }
 
   if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
@@ -1844,31 +1905,36 @@ void LoopInterchangeTransform::restructureLoops(
 ///  In this way the initial const is used in the first iteration of loop.
 void LoopInterchangeTransform::undoSimpleReduction() {
 
-  auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
-  LoopInterchangeLegality::SimpleReduction *SR = InnerSimpleReductions[0];
+  ArrayRef<LoopInterchangeLegality::SimpleReduction> InnerSimpleReductions =
+      LIL.getInnerSimpleReductions();
+
+  assert(InnerSimpleReductions.size() == 1 &&
+         "So far we only support at most one reduction.");
+
+  LoopInterchangeLegality::SimpleReduction SR = InnerSimpleReductions[0];
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
   IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
 
   // When the reduction is intialized from constant value, we need to add
   // a stmt loading from the memory object to target basic block in inner
   // loop during undoing the reduction.
-  Instruction *LoadMem = Builder.CreateLoad(SR->ElemTy, SR->MemRef);
+  Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
 
   // Check if it's the first iteration.
   auto &InductionPHIs = LIL.getInnerLoopInductions();
-  PHINode *IV = InductionPHIs[0];
+  PHINode *IV = SR.CounterIV;
   Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
   Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
 
   // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
-  Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR->Init, "new.var");
+  Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR.Init, "new.var");
 
   // Replace all uses of reduction var with new variable.
-  SR->Re->replaceAllUsesWith(NewVar);
+  SR.Re->replaceAllUsesWith(NewVar);
 
   // Move store instruction into inner loop, just after reduction next's def.
-  SR->LcssaStorer->setOperand(0, SR->Next);
-  SR->LcssaStorer->moveAfter(dyn_cast<Instruction>(SR->Next));
+  SR.LcssaStorer->setOperand(0, SR.Next);
+  SR.LcssaStorer->moveAfter(dyn_cast<Instruction>(SR.Next));
 }
 
 bool LoopInterchangeTransform::transform(
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
new file mode 100644
index 0000000000000..f4a2266ef9ffe
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -0,0 +1,240 @@
+; Several cases of undoing simple reductions that have not yet been supported.
+; RUN: opt < %s -passes="loop-interchange"  -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
+; RUN:            -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+
+; 1. The initial value of the reduction is not a constant.
+; for (int i = 0; i < n; i++) {
+;   for (int j = 0; j < n; j++)
+;     s[i] = s[i] + a[j][i] * b[j][i];
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIInner
+; CHECK-NEXT: Function:        simple_reduction_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_01(
+; IR-NOT: split
+define void @simple_reduction_01(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %s_init = load double, ptr %addr_s, align 8
+  br label %innerloop
+
+innerloop:
+  %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+  %reduction = phi double [ %s_init, %outerloop_header ], [ %add, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+  %lcssa = phi double [ %add, %innerloop ]
+  store double %lcssa, ptr %addr_s, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+  ret void
+}
+
+; 2. There are two or more reductions
+; for (int i = 0; i < n; i++) {
+;   s[i] = 0;
+;   s2[i] = 0;
+;   for (int j = 0; j < n; j++){
+;     s[i] = s[i] + a[j][i] * b[j][i];
+;     s2[i] = s2[i] + a[j][i];
+;   }
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIInner
+; CHECK-NEXT: Function:        simple_reduction_02
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_02(
+; IR-NOT: split
+define void @simple_reduction_02(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, ptr noalias noundef writeonly captures(none) %s2, i64  noundef %n) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  %addr_s2 = getelementptr inbounds nuw double, ptr %s2, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  br label %innerloop
+
+innerloop:
+  %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+  %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+  %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  %add2 = fadd fast double %reduction2, %0
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+  %lcssa = phi double [ %add, %innerloop ]
+  %lcssa2 = phi double [%add2, %innerloop]
+  store double %lcssa, ptr %addr_s, align 8
+  store double %lcssa2, ptr %addr_s2, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+  ret void
+}
+
+; 3. The reduction is used more than once in the outer loop.
+; for (int i = 0; i < n; i++) {
+;   s[i] = 0;
+;   for (int j = 0; j < n; j++)
+;     s[i] = s[i] + a[j][i] * b[j][i];
+;   s[i] += 1;
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIInner
+; CHECK-NEXT: Function:        simple_reduction_03
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_03(
+; IR-NOT: split
+define void @simple_reduction_03(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  br label %innerloop
+
+innerloop:
+  %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+  %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+  %lcssa = phi double [ %add, %innerloop ]
+  store double %lcssa, ptr %addr_s, align 8
+  %add17.us = fadd fast double %lcssa, 1.000000e+00
+  store double %add17.us, ptr %addr_s, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+  ret void
+}
+
+
+; 4. The reduction is not in the innermost loop.
+; for (int i = 0; i < n; i++) {
+;   s[i] = 0;
+;   for (int j = 0; j < n; j++) {
+;     s[i] = s[i] + a[j][i] * b[j][i]; // reduction
+;     for (int k = 0; k < n; k++)
+;       c[k] = 1;
+
+;   }
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only outer loops with induction or reduction PHI nodes can be interchanged currently.
+
+; IR-LABEL: @simple_reduction_04(
+; IR-NOT: split
+define void @simple_reduction_04(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %c, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %i_loop_header, label %exit
+
+i_loop_header:
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %i_loop_latch ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  br label %j_loop
+
+j_loop:
+  %index_j = phi i64 [ 0, %i_loop_header ], [ %index_j.next, %j_loop_latch ]
+  %reduction = phi double [ 0.000000e+00, %i_loop_header ], [ %add, %j_loop_latch ]
+  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  br label %k_loop
+  
+k_loop:                                 
+  %index_k = phi i64 [ %index_k.next, %k_loop ], [ 0, %j_loop ]
+  %arrayidx22.us.us = getelementptr inbounds nuw double, ptr %c, i64 %index_k
+  ; store double 1.000000e+00, ptr %arrayidx22.us.us, align 8 // Avoid unrelated store instructions from affecting the interchange of the i-loop and j-loop
+  %index_k.next = add nuw nsw i64 %index_k, 1
+  %exitcond.not = icmp eq i64 %index_k.next, %n
+  br i1 %exitcond.not, label %j_loop_latch, label %k_loop
+
+j_loop_latch:    
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %i_loop_latch, label %j_loop
+
+i_loop_latch:
+  %lcssa = phi double [ %add, %j_loop_latch ]
+  store double %lcssa, ptr %addr_s, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %i_loop_header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
index 9a4393f827a36..d16f07b6b084e 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -1,5 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; NOTE: Support simple reduction in the inner loop by undoing the simple reduction.
-; RUN: opt < %s -passes="loop(loop-interchange),dce"  -undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
+; RUN: opt < %s -passes="loop-interchange"  -loop-interchange-undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
 
 ; for (int i = 0; i < n; i++) {
 ;   s[i] = 0;
@@ -24,6 +25,7 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
 ; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
 ; CHECK:       innerloop:
 ; CHECK-NEXT:    [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[REDUCTION_Dead:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
 ; CHECK:       innerloop.split1:
 ; CHECK-NEXT:    [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
@@ -36,8 +38,12 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
 ; CHECK-NEXT:    store double [[ADD]], ptr [[ADDR_S]], align 8
+; CHECK-NEXT:    [[DEAD_J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
+; CHECK-NEXT:    [[DEAD_COND:%.*]] = icmp eq i64 [[DEAD_J_NEXT]], [[N]]
 ; CHECK-NEXT:    br label [[OUTERLOOP_LATCH:%.*]]
 ; CHECK:       innerloop.split:
+; CHECK-NEXT:    [[DEAD_ADD_LCSSA:%.*]] = phi double [ [[ADD]], [[OUTERLOOP_LATCH]] ]
+; CHECK-NEXT:    [[DEAD_LCSSA:%.*]] = phi double [ [[ADD]], [[OUTERLOOP_LATCH]] ]
 ; CHECK-NEXT:    [[J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 [[J_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[EXIT_LOOPEXIT:%.*]], label [[INNERLOOP]]

>From 7842fbf3835e71f489c64c6dad6b0d4f1b9cc718 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 18:05:27 +0800
Subject: [PATCH 06/10] correct the clang-format

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 7cd6744c34528..1b72530a79ae7 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -953,7 +953,7 @@ static Value *followLCSSA(Value *SV) {
 }
 
 static bool CheckReductionKind(Loop *L, PHINode *PHI,
-                        SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+                               SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
   RecurrenceDescriptor RD;
   if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
     // Detect floating point reduction only when it can be reordered.
@@ -1043,8 +1043,8 @@ findInnerReductionPhi(Loop *L, Value *V,
     }
   }
 
-      return nullptr;
-    }
+  return nullptr;
+}
 
 static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
   Instruction *IncI = dyn_cast<Instruction>(IncV);
@@ -1157,7 +1157,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
     return false;
 
   // Find lcssa_phi in OuterLoop's Latch
-  BasicBlock *ExitBlock = L->getExitBlock();;
+  BasicBlock *ExitBlock = L->getExitBlock();
   if (!ExitBlock)
     return false;
 

>From 6f488cd9da5f28cc6b48d72564c6b9c76609b450 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 18:22:29 +0800
Subject: [PATCH 07/10] remove unused variable.

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 1b72530a79ae7..bfb3fe2af5f4c 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1921,7 +1921,6 @@ void LoopInterchangeTransform::undoSimpleReduction() {
   Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
 
   // Check if it's the first iteration.
-  auto &InductionPHIs = LIL.getInnerLoopInductions();
   PHINode *IV = SR.CounterIV;
   Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
   Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");

>From 4f58163dee1e5fd0938d82e9458c6616b6a04212 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 17:24:51 +0800
Subject: [PATCH 08/10] Add one negative test and improve security

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 152 +++++++---------
 .../simple-reduction-limitation.ll            | 172 +++++++++++++-----
 .../LoopInterchange/simple-reduction.ll       |  29 +--
 3 files changed, 208 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index bfb3fe2af5f4c..0789735625934 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -496,12 +496,9 @@ class LoopInterchangeLegality {
     // The memory Location
     Value *MemRef;
     Type *ElemTy;
-
-    /// IV used for the loop exit condition.
-    PHINode *CounterIV;
   };
 
-  const ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
+  ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
     return InnerSimpleReductions;
   }
 
@@ -952,7 +949,7 @@ static Value *followLCSSA(Value *SV) {
   return followLCSSA(PHI->getIncomingValue(0));
 }
 
-static bool CheckReductionKind(Loop *L, PHINode *PHI,
+static bool checkReductionKind(Loop *L, PHINode *PHI,
                                SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
   RecurrenceDescriptor RD;
   if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
@@ -1036,7 +1033,7 @@ findInnerReductionPhi(Loop *L, Value *V,
       if (PHI->getNumIncomingValues() == 1)
         continue;
 
-      if (CheckReductionKind(L, PHI, HasNoWrapInsts))
+      if (checkReductionKind(L, PHI, HasNoWrapInsts))
         return PHI;
       else
         return nullptr;
@@ -1046,65 +1043,6 @@ findInnerReductionPhi(Loop *L, Value *V,
   return nullptr;
 }
 
-static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
-  Instruction *IncI = dyn_cast<Instruction>(IncV);
-  if (!IncI)
-    return nullptr;
-
-  if (IncI->getOpcode() != Instruction::Add &&
-      IncI->getOpcode() != Instruction::Sub)
-    return nullptr;
-
-  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
-  if (Phi && Phi->getParent() == L->getHeader()) {
-    return Phi;
-  }
-
-  // Allow add/sub to be commuted.
-  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
-  if (Phi && Phi->getParent() == L->getHeader()) {
-    return Phi;
-  }
-
-  return nullptr;
-}
-
-/// UndoSimpleReduction requires the first_iteration check, so look for
-/// the IV used for the loop exit condition
-static PHINode *findCounterIV(Loop *L) {
-
-  assert(L->getLoopLatch() && "Must be in simplified form");
-
-  BranchInst *BI = cast<BranchInst>(L->getLoopLatch()->getTerminator());
-  if (L->isLoopInvariant(BI->getCondition()))
-    return nullptr;
-
-  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
-  if (!Cond)
-    return nullptr;
-
-  // Look for a loop invariant RHS
-  Value *LHS = Cond->getOperand(0);
-  Value *RHS = Cond->getOperand(1);
-  if (!L->isLoopInvariant(RHS)) {
-    if (!L->isLoopInvariant(LHS))
-      return nullptr;
-    std::swap(LHS, RHS);
-  }
-
-  // IndVar = phi[{InitialValue, preheader}, {StepInst, latch}]
-  // StepInst = IndVar + step
-  // case 1:
-  // cmp = IndVar < FinalValue
-  PHINode *Counter = dyn_cast<PHINode>(LHS);
-  // case 2:
-  // cmp = StepInst < FinalValue
-  if (!Counter)
-    Counter = getCounterFromInc(LHS, L);
-
-  return Counter;
-}
-
 /// Detect and record the simple reduction of the inner loop.
 ///
 ///    innerloop:
@@ -1121,8 +1059,15 @@ bool LoopInterchangeLegality::findSimpleReduction(
 
   // Only support undo simple reduction if the loop nest to be interchanged is
   // the innermostin two loops.
-  if (!L->isInnermost())
+  if (!L->isInnermost()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+                                      L->getStartLoc(), L->getHeader())
+             << "Cannot undo a reduction when the loop is not the innermost "
+                "loop.";
+    });
     return false;
+  }
 
   if (Phi->getNumIncomingValues() != 2)
     return false;
@@ -1131,8 +1076,14 @@ bool LoopInterchangeLegality::findSimpleReduction(
   Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
 
   // So far only supports constant initial value.
-  if (!isa<Constant>(Init))
+  if (!isa<Constant>(Init)) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+                                      L->getStartLoc(), L->getHeader())
+             << "Cannot undo a reduction with non-constant initial value.";
+    });
     return false;
+  }
 
   // The reduction result must live in the inner loop.
   if (Instruction *I = dyn_cast<Instruction>(Next)) {
@@ -1153,7 +1104,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
     return false;
 
   // Check the reduction kind.
-  if (ReUser != Next && !CheckReductionKind(L, Phi, HasNoWrapInsts))
+  if (ReUser != Next && !checkReductionKind(L, Phi, HasNoWrapInsts))
     return false;
 
   // Find lcssa_phi in OuterLoop's Latch
@@ -1175,27 +1126,42 @@ bool LoopInterchangeLegality::findSimpleReduction(
     } else
       return false;
   }
-  if (!Lcssa || !Lcssa->hasOneUser())
+  if (!Lcssa)
     return false;
 
+  if (!Lcssa->hasOneUser()) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+                                      L->getStartLoc(), L->getHeader())
+             << "Cannot undo a reduction when the reduction is used more than "
+                "once in the outer loop.";
+    });
+    return false;
+  }
+
   StoreInst *LcssaStorer =
       dyn_cast<StoreInst>(Lcssa->getUniqueUndroppableUser());
-  if (!LcssaStorer)
+  if (!LcssaStorer || LcssaStorer->getParent() != ExitBlock)
     return false;
 
   Value *MemRef = LcssaStorer->getOperand(1);
   Type *ElemTy = LcssaStorer->getOperand(0)->getType();
 
-  // LcssaStorer stores the reduction result in BB. undoSimpleReduction() will
-  // move it into the inner loop. Here we must ensure that the memory reference
-  // and its operands dominate the target block; otherwise the move is unsafe.
-  if (!DT->dominates(dyn_cast<Instruction>(MemRef), ExitBlock))
-    return false;
-
-  // find the IV used for the loop exit condition.
-  PHINode *CounterIV = findCounterIV(L);
-  if (!CounterIV)
+  // LcssaStorer stores the reduction result in BB.
+  // When the reduction is initialized from a constant value, we need to load
+  // from the memory object into the target basic block of the inner loop during
+  // the undoing of the reduction. This means the memory reference is used
+  // earlier than in the original code. So we must ensure that the memory
+  // reference dominates the header of the inner loop.
+  if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+                                      L->getStartLoc(), L->getHeader())
+             << "Cannot undo a reduction when memory reference does not "
+                "dominate the inner loop.";
+    });
     return false;
+  }
 
   // Found a simple reduction of inner loop.
   SimpleReduction SR;
@@ -1206,7 +1172,6 @@ bool LoopInterchangeLegality::findSimpleReduction(
   SR.LcssaStorer = LcssaStorer;
   SR.MemRef = MemRef;
   SR.ElemTy = ElemTy;
-  SR.CounterIV = CounterIV;
 
   InnerSimpleReductions.push_back(SR);
   return true;
@@ -1254,8 +1219,15 @@ bool LoopInterchangeLegality::findInductionAndReductions(
   }
 
   // For now we only support at most one reduction.
-  if (InnerSimpleReductions.size() > 1)
+  if (InnerSimpleReductions.size() > 1){
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+                                      L->getStartLoc(), L->getHeader())
+             << "Cannot undo a reduction with two or more reductions.";
+    });
     return false;
+  }
+
   return true;
 }
 
@@ -1915,16 +1887,21 @@ void LoopInterchangeTransform::undoSimpleReduction() {
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
   IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
 
+  // Check if it's the first iteration.
+  LLVMContext &Context = InnerLoopHeader->getContext();
+  PHINode *FirstIter =
+      Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
+  FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
+                         InnerLoop->getLoopPreheader());
+  FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
+                         InnerLoop->getLoopLatch());
+  assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
+
   // When the reduction is intialized from constant value, we need to add
   // a stmt loading from the memory object to target basic block in inner
   // loop during undoing the reduction.
   Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
 
-  // Check if it's the first iteration.
-  PHINode *IV = SR.CounterIV;
-  Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
-  Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
-
   // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
   Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR.Init, "new.var");
 
@@ -1940,7 +1917,8 @@ bool LoopInterchangeTransform::transform(
     ArrayRef<Instruction *> DropNoWrapInsts) {
   bool Transformed = false;
 
-  auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+  ArrayRef<LoopInterchangeLegality::SimpleReduction> InnerSimpleReductions =
+      LIL.getInnerSimpleReductions();
   if (EnableUndoSimpleReduction && InnerSimpleReductions.size() == 1)
     undoSimpleReduction();
 
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
index f4a2266ef9ffe..dee19ed11bd1b 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -1,15 +1,23 @@
 ; Several cases of undoing simple reductions that have not yet been supported.
 ; RUN: opt < %s -passes="loop-interchange"  -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
-; RUN:            -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
-; RUN: FileCheck --input-file=%t %s
+; RUN:            -pass-remarks-output=%t -S | FileCheck --input-file=%t %s
 
 
 ; 1. The initial value of the reduction is not a constant.
 ; for (int i = 0; i < n; i++) {
+;   r = s[i];
 ;   for (int j = 0; j < n; j++)
-;     s[i] = s[i] + a[j][i] * b[j][i];
+;     r = r + a[j][i] * b[j][i];
+;   s[i] = r;
 ; }
 
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedSimpleReduction
+; CHECK-NEXT: Function:        simple_reduction_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot undo a reduction with non-constant initial value.
+
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            UnsupportedPHIInner
@@ -17,9 +25,7 @@
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
-; IR-LABEL: @simple_reduction_01(
-; IR-NOT: split
-define void @simple_reduction_01(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+define void @simple_reduction_01(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %outerloop_header, label %exit
@@ -27,17 +33,17 @@ entry:
 outerloop_header:
   %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
   %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
-  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
-  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
   %s_init = load double, ptr %addr_s, align 8
   br label %innerloop
 
 innerloop:
   %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
   %reduction = phi double [ %s_init, %outerloop_header ], [ %add, %innerloop ]
-  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
   %0 = load double, ptr %addr_a_j_i, align 8
-  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
   %1 = load double, ptr %addr_b_j_i, align 8
   %mul = fmul fast double %1, %0
   %add = fadd fast double %mul, %reduction
@@ -58,14 +64,22 @@ exit:
 
 ; 2. There are two or more reductions
 ; for (int i = 0; i < n; i++) {
-;   s[i] = 0;
-;   s2[i] = 0;
+;   r1 = 0;
+;   r2 = 0;
 ;   for (int j = 0; j < n; j++){
-;     s[i] = s[i] + a[j][i] * b[j][i];
-;     s2[i] = s2[i] + a[j][i];
+;     r1 = r1 + a[j][i] * b[j][i];
+;     r2 = r2 + a[j][i];
 ;   }
+;   s[i] = r1;
+;   s2[i] = r2;
 ; }
 
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedSimpleReduction
+; CHECK-NEXT: Function:        simple_reduction_02
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot undo a reduction with two or more reductions.
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            UnsupportedPHIInner
@@ -73,9 +87,7 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
-; IR-LABEL: @simple_reduction_02(
-; IR-NOT: split
-define void @simple_reduction_02(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, ptr noalias noundef writeonly captures(none) %s2, i64  noundef %n) {
+define void @simple_reduction_02(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, ptr noalias writeonly %s2, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %outerloop_header, label %exit
@@ -84,17 +96,17 @@ outerloop_header:
   %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
   %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
   %addr_s2 = getelementptr inbounds nuw double, ptr %s2, i64 %index_i
-  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
-  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
   br label %innerloop
 
 innerloop:
   %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
   %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
-  %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
-  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add2, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
   %0 = load double, ptr %addr_a_j_i, align 8
-  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
   %1 = load double, ptr %addr_b_j_i, align 8
   %mul = fmul fast double %1, %0
   %add = fadd fast double %mul, %reduction
@@ -116,14 +128,21 @@ exit:
   ret void
 }
 
-; 3. The reduction is used more than twice in the outer loop.
+; 3. The reduction is used more than once in the outer loop.
 ; for (int i = 0; i < n; i++) {
-;   s[i] = 0;
+;   r = 0;
 ;   for (int j = 0; j < n; j++)
-;     s[i] = s[i] + a[j][i] * b[j][i];
-;   s[i] += 1;
+;     r = r + a[j][i] * b[j][i];
+;   r += 1;
+;   s[i] = r;
 ; }
 
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedSimpleReduction
+; CHECK-NEXT: Function:        simple_reduction_03
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot undo a reduction when the reduction is used more than once in the outer loop.
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            UnsupportedPHIInner
@@ -131,9 +150,7 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
-; IR-LABEL: @simple_reduction_03(
-; IR-NOT: split
-define void @simple_reduction_03(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+define void @simple_reduction_03(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %outerloop_header, label %exit
@@ -141,16 +158,16 @@ entry:
 outerloop_header:
   %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
   %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
-  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
-  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
   br label %innerloop
 
 innerloop:
   %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
   %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
-  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
   %0 = load double, ptr %addr_a_j_i, align 8
-  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
   %1 = load double, ptr %addr_b_j_i, align 8
   %mul = fmul fast double %1, %0
   %add = fadd fast double %mul, %reduction
@@ -174,13 +191,13 @@ exit:
 
 ; 4. The reduction is not in the innermost loop.
 ; for (int i = 0; i < n; i++) {
-;   s[i] = 0;
+;   r = 0;
 ;   for (int j = 0; j < n; j++) {
-;     s[i] = s[i] + a[j][i] * b[j][i]; // reduction
+;     r = r + a[j][i] * b[j][i]; // reduction
 ;     for (int k = 0; k < n; k++)
 ;       c[k] = 1;
-
 ;   }
+;   s[i] = r;
 ; }
 
 ; CHECK: --- !Missed
@@ -189,10 +206,20 @@ exit:
 ; CHECK-NEXT: Function:        simple_reduction_04
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only outer loops with induction or reduction PHI nodes can be interchanged currently.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedSimpleReduction
+; CHECK-NEXT: Function:        simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot undo a reduction when the loop is not the innermost loop.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIInner
+; CHECK-NEXT: Function:        simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
-; IR-LABEL: @simple_reduction_04(
-; IR-NOT: split
-define void @simple_reduction_04(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %c, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+define void @simple_reduction_04(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %c, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %i_loop_header, label %exit
@@ -200,16 +227,16 @@ entry:
 i_loop_header:
   %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %i_loop_latch ]
   %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
-  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
-  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
   br label %j_loop
 
 j_loop:
   %index_j = phi i64 [ 0, %i_loop_header ], [ %index_j.next, %j_loop_latch ]
   %reduction = phi double [ 0.000000e+00, %i_loop_header ], [ %add, %j_loop_latch ]
-  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
   %0 = load double, ptr %addr_a_j_i, align 8
-  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
   %1 = load double, ptr %addr_b_j_i, align 8
   %mul = fmul fast double %1, %0
   %add = fadd fast double %mul, %reduction
@@ -218,7 +245,6 @@ j_loop:
 k_loop:                                 
   %index_k = phi i64 [ %index_k.next, %k_loop ], [ 0, %j_loop ]
   %arrayidx22.us.us = getelementptr inbounds nuw double, ptr %c, i64 %index_k
-  ; store double 1.000000e+00, ptr %arrayidx22.us.us, align 8 // Avoid unrelated store instructions from affecting the interchange of the i-loop and j-loop
   %index_k.next = add nuw nsw i64 %index_k, 1
   %exitcond.not = icmp eq i64 %index_k.next, %n
   br i1 %exitcond.not, label %j_loop_latch, label %k_loop
@@ -238,3 +264,61 @@ i_loop_latch:
 exit:
   ret void
 }
+
+
+; 5. MemRef doesn't dominate InnerLoop's HeaderBB.
+; for (int i = 0; i < n; i++) {
+;   r = 0;
+;   for (int j = 0; j < n; j++)
+;     r = r + a[j][i] * b[j][i];
+;   s[i] = r;
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedSimpleReduction
+; CHECK-NEXT: Function:        simple_reduction_05
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot undo a reduction when memory reference does not dominate the inner loop.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIInner
+; CHECK-NEXT: Function:        simple_reduction_05
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+define void @simple_reduction_05(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+  %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
+  br label %innerloop
+
+innerloop:
+  %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+  %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
+  %0 = load double, ptr %addr_a_j_i, align 8
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
+  %1 = load double, ptr %addr_b_j_i, align 8
+  %mul = fmul fast double %1, %0
+  %add = fadd fast double %mul, %reduction
+  %index_j.next = add nuw nsw i64 %index_j, 1
+  %cond1 = icmp eq i64 %index_j.next, %n
+  br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+  %lcssa = phi double [ %add, %innerloop ]
+  %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+  store double %lcssa, ptr %addr_s, align 8
+  %index_i.next = add nuw nsw i64 %index_i, 1
+  %cond2 = icmp eq i64 %index_i.next, %n
+  br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
index d16f07b6b084e..db37c64013232 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -3,13 +3,14 @@
 ; RUN: opt < %s -passes="loop-interchange"  -loop-interchange-undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
 
 ; for (int i = 0; i < n; i++) {
-;   s[i] = 0;
+;   r = 0;
 ;   for (int j = 0; j < n; j++)
-;     s[i] = s[i] + a[j][i] * b[j][i];
+;     r = r + a[j][i] * b[j][i];
+;   s[i] = r;
 ; }
 
-define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
-; CHECK-LABEL: define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64  noundef %n) {
+define void @func(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
+; CHECK-LABEL: define void @func(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[INNERLOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
@@ -18,22 +19,22 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
 ; CHECK:       outerloop_header:
 ; CHECK-NEXT:    [[INDEX_I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[OUTERLOOP_LATCH:%.*]] ], [ 0, [[OUTERLOOPHEADER_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    [[ADDR_S:%.*]] = getelementptr inbounds nuw double, ptr %s, i64 [[INDEX_I]]
-; CHECK-NEXT:    [[ADDR_A:%.*]] = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 [[INDEX_I]]
-; CHECK-NEXT:    [[ADDR_B:%.*]] = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT:    [[ADDR_A:%.*]] = getelementptr inbounds nuw double, ptr %a, i64 [[INDEX_I]]
+; CHECK-NEXT:    [[ADDR_B:%.*]] = getelementptr inbounds nuw double, ptr %b, i64 [[INDEX_I]]
 ; CHECK-NEXT:    br label [[INNERLOOP_SPLIT1:%.*]]
 ; CHECK:       innerloop.preheader:
 ; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
 ; CHECK:       innerloop:
 ; CHECK-NEXT:    [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[REDUCTION_Dead:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[DEAD_REDUCTION:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[FIRSTITER:%.*]] = phi i1 [ false, [[INNERLOOP_SPLIT:%.*]] ], [ true, [[INNERLOOP_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
 ; CHECK:       innerloop.split1:
 ; CHECK-NEXT:    [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
-; CHECK-NEXT:    [[FIRSTITER:%.*]] = icmp ne i64 [[INDEX_J]], 0
 ; CHECK-NEXT:    [[NEW_VAR:%.*]] = select i1 [[FIRSTITER]], double [[S]], double 0.000000e+00
-; CHECK-NEXT:    [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_A]], i64 [[INDEX_J]]
+; CHECK-NEXT:    [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw double, ptr [[ADDR_A]], i64 [[INDEX_J]]
 ; CHECK-NEXT:    [[A_J_I:%.*]] = load double, ptr [[ADDR_A_J_I]], align 8
-; CHECK-NEXT:    [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_B]], i64 [[INDEX_J]]
+; CHECK-NEXT:    [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw double, ptr [[ADDR_B]], i64 [[INDEX_J]]
 ; CHECK-NEXT:    [[B_J_I:%.*]] = load double, ptr [[ADDR_B_J_I]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
@@ -63,16 +64,16 @@ entry:
 outerloop_header:                                      
   %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
   %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
-  %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
-  %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+  %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+  %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
   br label %innerloop
 
 innerloop:                                     
   %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
   %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
-  %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+  %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
   %0 = load double, ptr %addr_a_j_i, align 8
-  %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+  %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
   %1 = load double, ptr %addr_b_j_i, align 8
   %mul = fmul fast double %1, %0
   %add = fadd fast double %mul, %reduction

>From aa84d82fcff25f804c643f8c95958d4d6fbad0ae Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 17:28:00 +0800
Subject: [PATCH 09/10] correct the format

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 0789735625934..12599ae4970d5 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1219,7 +1219,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
   }
 
   // For now we only support at most one reduction.
-  if (InnerSimpleReductions.size() > 1){
+  if (InnerSimpleReductions.size() > 1) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
                                       L->getStartLoc(), L->getHeader())

>From b159dc43d98e32ebce50e788c82d1c3410fddafb Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 18:06:34 +0800
Subject: [PATCH 10/10] fix test

---
 .../LoopInterchange/simple-reduction-limitation.ll  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
index dee19ed11bd1b..7218415fa81c0 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -1,6 +1,7 @@
 ; Several cases of undoing simple reductions that have not yet been supported.
 ; RUN: opt < %s -passes="loop-interchange"  -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
-; RUN:            -pass-remarks-output=%t -S | FileCheck --input-file=%t %s
+; RUN:            -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
 
 
 ; 1. The initial value of the reduction is not a constant.
@@ -25,6 +26,8 @@
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
+; IR-LABEL: @simple_reduction_01(
+; IR-NOT: split
 define void @simple_reduction_01(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
@@ -87,6 +90,8 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
+; IR-LABEL: @simple_reduction_02(
+; IR-NOT: split
 define void @simple_reduction_02(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, ptr noalias writeonly %s2, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
@@ -150,6 +155,8 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
+; IR-LABEL: @simple_reduction_03(
+; IR-NOT: split
 define void @simple_reduction_03(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
@@ -219,6 +226,8 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
+; IR-LABEL: @simple_reduction_04(
+; IR-NOT: split
 define void @simple_reduction_04(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %c, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0
@@ -287,6 +296,8 @@ exit:
 ; CHECK-NEXT: Args:
 ; CHECK-NEXT:   - String:          Only inner loops with induction or reduction PHI nodes can be interchange currently.
 
+; IR-LABEL: @simple_reduction_05(
+; IR-NOT: split
 define void @simple_reduction_05(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64  %n) {
 entry:
   %cmp = icmp sgt i64 %n, 0