[llvm] [LoopInterchange] Drop nuw/nsw flags from reduction ops when interchanging (PR #148612)

Tue Jul 15 05:22:13 PDT 2025

https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/148612

>From 34ce36c9bb657f1cb3ea93af33f387efd497adf8 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Mon, 14 Jul 2025 00:15:42 +0000
Subject: [PATCH 1/5] [LoopInterchange] Reject interchange if non-reassociative
 reduction exists

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp |  57 +-
 .../Transforms/LoopInterchange/pr48212.ll     |   2 +-
 .../LoopInterchange/reductions-kind.ll        | 864 ++++++++++++++++++
 3 files changed, 921 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/reductions-kind.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index a5008907b9014..a2aa72e1a01f2 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -812,7 +812,62 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
         // Detect floating point reduction only when it can be reordered.
         if (RD.getExactFPMathInst() != nullptr)
           return nullptr;
-        return PHI;
+
+        RecurKind RK = RD.getRecurrenceKind();
+        switch (RK) {
+        case RecurKind::Or:
+        case RecurKind::And:
+        case RecurKind::Xor:
+        case RecurKind::SMin:
+        case RecurKind::SMax:
+        case RecurKind::UMin:
+        case RecurKind::UMax:
+        case RecurKind::FAdd:
+        case RecurKind::FMul:
+        case RecurKind::FMin:
+        case RecurKind::FMax:
+        case RecurKind::FMinimum:
+        case RecurKind::FMaximum:
+        case RecurKind::FMinimumNum:
+        case RecurKind::FMaximumNum:
+        case RecurKind::FMulAdd:
+        case RecurKind::AnyOf:
+          return PHI;
+
+        // Changing the order of integer addition/multiplication may change the
+        // semantics. Consider the following case:
+        //
+        //  int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
+        //  int sum = 0;
+        //  for (int i = 0; i < 2; i++)
+        //    for (int j = 0; j < 2; j++)
+        //      sum += A[j][i];
+        //
+        // If the above loops are exchanged, the addition will cause an
+        // overflow. To prove the legality, we must ensure that all reduction
+        // operations don't have nuw/nsw flags.
+        case RecurKind::Add:
+        case RecurKind::Mul: {
+          unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
+          SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
+
+          // FIXME: Is this check necessary?
+          if (Ops.empty())
+            return nullptr;
+          for (Instruction *I : Ops) {
+            // FIXME: Is this check necessary?
+            if (I->getOpcode() != OpCode)
+              return nullptr;
+
+            // Reject if the reduction operation has nuw/nsw flags.
+            if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
+              return nullptr;
+          }
+          return PHI;
+        }
+        default:
+          return nullptr;
+        }
       }
       return nullptr;
     }
diff --git a/llvm/test/Transforms/LoopInterchange/pr48212.ll b/llvm/test/Transforms/LoopInterchange/pr48212.ll
index 936c53e217540..cb1300846cf0f 100644
--- a/llvm/test/Transforms/LoopInterchange/pr48212.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr48212.ll
@@ -38,7 +38,7 @@ for.body3:                                        ; preds = %L2, %for.inc
   %idxprom4 = sext i32 %k1.03 to i64
   %arrayidx5 = getelementptr inbounds [5 x i32], ptr %arrayidx, i64 0, i64 %idxprom4
   %0 = load i32, ptr %arrayidx5
-  %add = add nsw i32 %temp.12, %0
+  %add = add i32 %temp.12, %0
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body3
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-kind.ll b/llvm/test/Transforms/LoopInterchange/reductions-kind.ll
new file mode 100644
index 0000000000000..d9e4d58a1780e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/reductions-kind.ll
@@ -0,0 +1,864 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-output=%t -disable-output \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa
+; RUN: FileCheck -input-file=%t %s
+
+; int sum = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     sum += A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_add
+define void @reduction_add(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %sum.j.next = add nsw i32 %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_wrap_add
+define void @reduction_wrap_add(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %sum.j.next = add i32 %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_cast_add
+define void @reduction_cast_add(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %sum.j.trunc = trunc i32 %sum.j to i16
+  %sum.j.ext = zext i16 %sum.j.trunc to i32
+  %sum.j.next = add nsw i32 %sum.j.ext, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int prod = 1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     prod *= A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_mul
+define void @reduction_mul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %prod.j.next = mul nsw i32 %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_wrap_mul
+define void @reduction_wrap_mul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %prod.j.next = mul i32 %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int b_or = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     b_or |= A[j][i];
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_or
+define void @reduction_or(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %or.i = phi i32 [ 0, %entry ], [ %or.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %or.j = phi i32 [ %or.i, %for.i.header ], [ %or.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %or.j.next = or i32 %or.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %or.i.lcssa = phi i32 [ %or.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int b_and = -1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     b_and &= A[j][i];
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_and
+define void @reduction_and(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %and.i = phi i32 [ -1, %entry ], [ %and.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %and.j = phi i32 [ %and.i, %for.i.header ], [ %and.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %and.j.next = and i32 %and.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %and.i.lcssa = phi i32 [ %and.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int b_xor = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     b_xor ^= A[j][i];
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_xor
+define void @reduction_xor(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %xor.i = phi i32 [ 0, %entry ], [ %xor.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %xor.j = phi i32 [ %xor.i, %for.i.header ], [ %xor.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %xor.j.next = xor i32 %xor.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %xor.i.lcssa = phi i32 [ %xor.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int smin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     smin = (A[j][i] < smin) ? A[j][i] : smin;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_smin
+define void @reduction_smin(ptr %A, i32 %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %smin.i = phi i32 [ %init, %entry ], [ %smin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %smin.j = phi i32 [ %smin.i, %for.i.header ], [ %smin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp slt i32 %a, %smin.j
+  %smin.j.next = select i1 %cmp, i32 %a, i32 %smin.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %smin.i.lcssa = phi i32 [ %smin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int smax = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     smax = (A[j][i] > smax) ? A[j][i] : smax;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_smax
+define void @reduction_smax(ptr %A, i32 %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %smax.i = phi i32 [ %init, %entry ], [ %smax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %smax.j = phi i32 [ %smax.i, %for.i.header ], [ %smax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp sgt i32 %a, %smax.j
+  %smax.j.next = select i1 %cmp, i32 %a, i32 %smax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %smax.i.lcssa = phi i32 [ %smax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; unsigned umin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     umin = (A[j][i] < umin) ? A[j][i] : umin;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_umin
+define void @reduction_umin(ptr %A, i32 %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %umin.i = phi i32 [ %init, %entry ], [ %umin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %umin.j = phi i32 [ %umin.i, %for.i.header ], [ %umin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp ult i32 %a, %umin.j
+  %umin.j.next = select i1 %cmp, i32 %a, i32 %umin.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %umin.i.lcssa = phi i32 [ %umin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; unsigned umax = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     umax = (A[j][i] > umax) ? A[j][i] : umax;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_umax
+define void @reduction_umax(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %umax.i = phi i32 [ 0, %entry ], [ %umax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %umax.j = phi i32 [ %umax.i, %for.i.header ], [ %umax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp ugt i32 %a, %umax.j
+  %umax.j.next = select i1 %cmp, i32 %a, i32 %umax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %umax.i.lcssa = phi i32 [ %umax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; int any_of = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     any_of = (A[j][i] == 42) ? 1 : any_of;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_anyof
+define void @reduction_anyof(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %anyof.i = phi i32 [ 0, %entry ], [ %anyof.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %anyof.j = phi i32 [ %anyof.i, %for.i.header ], [ %anyof.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp eq i32 %a, 42
+  %anyof.j.next = select i1 %cmp, i32 1, i32 %anyof.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %anyof.i.lcssa = phi i32 [ %anyof.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; float sum = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     sum += A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fadd
+define void @reduction_fadd(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %sum.j.next = fadd float %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fadd
+define void @reduction_reassoc_fadd(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %sum.j.next = fadd reassoc float %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi float [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; float prod = 1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     prod *= A[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fmul
+define void @reduction_fmul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %prod.j.next = fmul float %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fmul
+define void @reduction_reassoc_fmul(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %prod.j.next = fmul reassoc float %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi float [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; float fmuladd = 1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmuladd += A[j][i] * B[j][i];
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_fmuladd
+define void @reduction_fmuladd(ptr %A, ptr %B) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
+  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx.a, align 4
+  %b = load float, ptr %idx.b, align 4
+  %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_reassoc_fmuladd
+define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ]
+  %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx.a, align 4
+  %b = load float, ptr %idx.b, align 4
+  %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; float fmin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmin = (A[j][i] < fmin) ? A[j][i] : fmin;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmin
+define void @reduction_fmin(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %cmp = fcmp nnan nsz olt float %a, %fmin.j
+  %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmininumnum
+define void @reduction_fmininumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; float fmax = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmax = (A[j][i] > fmax) ? A[j][i] : fmax;
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmax
+define void @reduction_fmax(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %cmp = fcmp nnan nsz ogt float %a, %fmax.j
+  %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Pass
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        reduction_fmaxinumnum
+define void @reduction_fmaxinumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
+declare float @llvm.minimumnum.f32(float %a, float %b)
+declare float @llvm.maximumnum.f32(float %a, float %b)

>From 770ccd44913a428a6a56903de009e90ecd54df3a Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 15 Jul 2025 07:38:16 +0000
Subject: [PATCH 2/5] Drop nuw/nsw flags from reductions

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp |  41 +++-
 .../Transforms/LoopInterchange/pr48212.ll     |   2 +-
 ...l => reductions-non-wrapped-operations.ll} | 225 ++++--------------
 .../reductions-with-nowraps.ll                | 144 +++++++++++
 4 files changed, 223 insertions(+), 189 deletions(-)
 rename llvm/test/Transforms/LoopInterchange/{reductions-kind.ll => reductions-non-wrapped-operations.ll} (79%)
 create mode 100644 llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index a2aa72e1a01f2..ba3d60980da6f 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -379,6 +379,10 @@ class LoopInterchangeLegality {
     return InnerLoopInductions;
   }
 
+  ArrayRef<Instruction *> getHasNoWrapReductions() const {
+    return HasNoWrapReductions;
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
   bool containsUnsafeInstructions(BasicBlock *BB);
@@ -405,6 +409,11 @@ class LoopInterchangeLegality {
 
   /// Set of inner loop induction PHIs
   SmallVector<PHINode *, 8> InnerLoopInductions;
+
+  /// Hold instructions that have nuw/nsw flags and involved in reductions,
+  /// like integer addition/multiplication. Those flags must be dropped when
+  /// exchanging the loops.
+  SmallVector<Instruction *, 4> HasNoWrapReductions;
 };
 
 /// Manages information utilized by the profitability check for cache. The main
@@ -473,7 +482,7 @@ class LoopInterchangeTransform {
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
 
   /// Interchange OuterLoop and InnerLoop.
-  bool transform();
+  bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
   void restructureLoops(Loop *NewInner, Loop *NewOuter,
                         BasicBlock *OrigInnerPreHeader,
                         BasicBlock *OrigOuterPreHeader);
@@ -613,7 +622,7 @@ struct LoopInterchange {
     });
 
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
-    LIT.transform();
+    LIT.transform(LIL.getHasNoWrapReductions());
     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
     LoopsInterchanged++;
 
@@ -798,7 +807,9 @@ static Value *followLCSSA(Value *SV) {
 }
 
 // Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+static PHINode *
+findInnerReductionPhi(Loop *L, Value *V,
+                      SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
   // Reduction variables cannot be constants.
   if (isa<Constant>(V))
     return nullptr;
@@ -844,8 +855,9 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
         //      sum += A[j][i];
         //
         // If the above loops are exchanged, the addition will cause an
-        // overflow. To prove the legality, we must ensure that all reduction
-        // operations don't have nuw/nsw flags.
+        // overflow. To prevent this, we must drop the nuw/nsw flags from the
+        // addition/multiplication instructions when we actually exchange the
+        // loops.
         case RecurKind::Add:
         case RecurKind::Mul: {
           unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
@@ -859,12 +871,14 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
             if (I->getOpcode() != OpCode)
               return nullptr;
 
-            // Reject if the reduction operation has nuw/nsw flags.
+            // If the instruction has nuw/nsw flags, we must drop them when the
+            // transformation is actually performed.
             if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
-              return nullptr;
+              HasNoWrapInsts.push_back(I);
           }
           return PHI;
         }
+
         default:
           return nullptr;
         }
@@ -899,7 +913,8 @@ bool LoopInterchangeLegality::findInductionAndReductions(
         // Check if we have a PHI node in the outer loop that has a reduction
         // result from the inner loop as an incoming value.
         Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
-        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+        PHINode *InnerRedPhi =
+            findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
         if (!InnerRedPhi ||
             !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
           LLVM_DEBUG(
@@ -1485,7 +1500,8 @@ void LoopInterchangeTransform::restructureLoops(
   SE->forgetLoop(NewOuter);
 }
 
-bool LoopInterchangeTransform::transform() {
+bool LoopInterchangeTransform::transform(
+    ArrayRef<Instruction *> DropNoWrapInsts) {
   bool Transformed = false;
 
   if (InnerLoop->getSubLoops().empty()) {
@@ -1586,6 +1602,13 @@ bool LoopInterchangeTransform::transform() {
     return false;
   }
 
+  // Finally, drop the nsw/nuw flags from the instructions for reduction
+  // calculations.
+  for (Instruction *Reduction : DropNoWrapInsts) {
+    Reduction->setHasNoSignedWrap(false);
+    Reduction->setHasNoUnsignedWrap(false);
+  }
+
   return true;
 }
 
diff --git a/llvm/test/Transforms/LoopInterchange/pr48212.ll b/llvm/test/Transforms/LoopInterchange/pr48212.ll
index cb1300846cf0f..936c53e217540 100644
--- a/llvm/test/Transforms/LoopInterchange/pr48212.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr48212.ll
@@ -38,7 +38,7 @@ for.body3:                                        ; preds = %L2, %for.inc
   %idxprom4 = sext i32 %k1.03 to i64
   %arrayidx5 = getelementptr inbounds [5 x i32], ptr %arrayidx, i64 0, i64 %idxprom4
   %0 = load i32, ptr %arrayidx5
-  %add = add i32 %temp.12, %0
+  %add = add nsw i32 %temp.12, %0
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body3
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-kind.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
similarity index 79%
rename from llvm/test/Transforms/LoopInterchange/reductions-kind.ll
rename to llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
index d9e4d58a1780e..35ffd49666983 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions-kind.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
@@ -2,185 +2,8 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa
 ; RUN: FileCheck -input-file=%t %s
 
-; int sum = 0;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     sum += A[j][i];
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_add
-define void @reduction_add(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load i32, ptr %idx, align 4
-  %sum.j.next = add nsw i32 %sum.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_wrap_add
-define void @reduction_wrap_add(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load i32, ptr %idx, align 4
-  %sum.j.next = add i32 %sum.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_cast_add
-define void @reduction_cast_add(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load i32, ptr %idx, align 4
-  %sum.j.trunc = trunc i32 %sum.j to i16
-  %sum.j.ext = zext i16 %sum.j.trunc to i32
-  %sum.j.next = add nsw i32 %sum.j.ext, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-
-; int prod = 1;
-; for (int i = 0; i < 2; i++)
-;   for (int j = 0; j < 2; j++)
-;     prod *= A[j][i];
-
-; CHECK:      --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            UnsupportedPHIOuter
-; CHECK-NEXT: Function:        reduction_mul
-define void @reduction_mul(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load i32, ptr %idx, align 4
-  %prod.j.next = mul nsw i32 %prod.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-; CHECK:      --- !Pass
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        reduction_wrap_mul
-define void @reduction_wrap_mul(ptr %A) {
-entry:
-  br label %for.i.header
-
-for.i.header:
-  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
-  %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ]
-  br label %for.j
-
-for.j:
-  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
-  %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
-  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
-  %a = load i32, ptr %idx, align 4
-  %prod.j.next = mul i32 %prod.j, %a
-  %j.inc = add i32 %j, 1
-  %cmp.j = icmp slt i32 %j.inc, 2
-  br i1 %cmp.j, label %for.j, label %for.i.latch
-
-for.i.latch:
-  %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ]
-  %i.inc = add i32 %i, 1
-  %cmp.i = icmp slt i32 %i.inc, 2
-  br i1 %cmp.i, label %for.i.header, label %exit
-
-exit:
-  ret void
-}
-
-
+; Check that exchanging the loops is legal for the bitwise-or reduction.
+;
 ; int b_or = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -220,6 +43,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the bitwise-and reduction.
+;
 ; int b_and = -1;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -259,6 +84,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the bitwise-xor reduction.
+;
 ; int b_xor = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -298,6 +125,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the signed-minimum reduction.
+;
 ; int smin = init;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -338,6 +167,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the signed-maximum reduction.
+;
 ; int smax = init;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -378,6 +209,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the unsigned-minimum reduction.
+;
 ; unsigned umin = init;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -418,6 +251,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the unsigned-maximum reduction.
+;
 ; unsigned umax = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -458,6 +293,8 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the any-of reduction.
+;
 ; int any_of = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -497,6 +334,9 @@ exit:
   ret void
 }
 
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point addition.
+;
 ; float sum = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -535,6 +375,9 @@ exit:
   ret void
 }
 
+; Check that the interchange is legal if the floation-point addition is marked
+; as reassoc.
+;
 ; CHECK:      --- !Pass
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Interchanged
@@ -568,6 +411,9 @@ exit:
   ret void
 }
 
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point multiplication.
+;
 ; float prod = 1;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -606,6 +452,9 @@ exit:
   ret void
 }
 
+; Check that the interchange is legal if the floation-point multiplication is
+; marked as reassoc.
+;
 ; CHECK:      --- !Pass
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Interchanged
@@ -639,6 +488,9 @@ exit:
   ret void
 }
 
+; Check that the loops aren't exchanged if there is a reduction of
+; non-reassociative floating-point fmuladd.
+;
 ; float fmuladd = 0;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -679,6 +531,9 @@ exit:
   ret void
 }
 
+; Check that the interchange is legal if the floation-point fmuladd is marked
+; as reassoc.
+;
 ; CHECK:      --- !Pass
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Interchanged
@@ -714,6 +569,9 @@ exit:
   ret void
 }
 
+; Check that exchanging the loops is legal for the reassociative floating-point
+; minimum.
+;
 ; float fmin = init;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -754,6 +612,9 @@ exit:
 }
 
 
+; Check that exchanging the loops is legal for the floation-point
+; llvm.minimumnum.
+;
 ; CHECK:      --- !Pass
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Interchanged
@@ -787,6 +648,9 @@ exit:
   ret void
 }
 
+; Check that exchanging the loops is legal for the reassociative floating-point
+; maximum.
+;
 ; float fmax = init;
 ; for (int i = 0; i < 2; i++)
 ;   for (int j = 0; j < 2; j++)
@@ -826,6 +690,9 @@ exit:
   ret void
 }
 
+; Check that exchanging the loops is legal for the floation-point
+; llvm.maximumnum.
+
 ; CHECK:      --- !Pass
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Interchanged
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll b/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll
new file mode 100644
index 0000000000000..5c05f963a2f3e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -cache-line-size=64 -S < %s | FileCheck %s
+
+; Check that nsw/nuw flags are dropped when interchanging loops.
+;
+; int sum = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     sum += A[j][i];
+;
+define void @reduction_add(ptr %A) {
+; CHECK-LABEL: define void @reduction_add(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_J_PREHEADER:.*]]
+; CHECK:       [[FOR_I_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_I_HEADER:.*]]
+; CHECK:       [[FOR_I_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INC:%.*]], %[[FOR_I_LATCH:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_J:%.*]] = phi i32 [ [[SUM_J_NEXT:%.*]], %[[FOR_I_LATCH]] ], [ [[SUM_I:%.*]], %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_J_SPLIT1:.*]]
+; CHECK:       [[FOR_J_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_J:.*]]
+; CHECK:       [[FOR_J]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[TMP0:%.*]], %[[FOR_J_SPLIT:.*]] ], [ 0, %[[FOR_J_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_I]] = phi i32 [ [[SUM_I_LCSSA:%.*]], %[[FOR_J_SPLIT]] ], [ 0, %[[FOR_J_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_I_HEADER_PREHEADER]]
+; CHECK:       [[FOR_J_SPLIT1]]:
+; CHECK-NEXT:    [[IDX:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[A]], i32 0, i32 [[J]], i32 [[I]]
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[IDX]], align 4
+; CHECK-NEXT:    [[SUM_J_NEXT]] = add i32 [[SUM_J]], [[A]]
+; CHECK-NEXT:    [[J_INC:%.*]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], 2
+; CHECK-NEXT:    br label %[[FOR_I_LATCH]]
+; CHECK:       [[FOR_J_SPLIT]]:
+; CHECK-NEXT:    [[SUM_I_LCSSA]] = phi i32 [ [[SUM_J_NEXT]], %[[FOR_I_LATCH]] ]
+; CHECK-NEXT:    [[TMP0]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_J]], label %[[EXIT:.*]]
+; CHECK:       [[FOR_I_LATCH]]:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[I_INC]], 2
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[FOR_I_HEADER]], label %[[FOR_J_SPLIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %sum.j.next = add nuw nsw i32 %sum.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that nsw/nuw flags are dropped when interchanging loops.
+;
+; int prod = 1;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     prod *= A[j][i];
+;
+define void @reduction_mul(ptr %A) {
+; CHECK-LABEL: define void @reduction_mul(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_J_PREHEADER:.*]]
+; CHECK:       [[FOR_I_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_I_HEADER:.*]]
+; CHECK:       [[FOR_I_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INC:%.*]], %[[FOR_I_LATCH:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PROD_J:%.*]] = phi i32 [ [[PROD_J_NEXT:%.*]], %[[FOR_I_LATCH]] ], [ [[PROD_I:%.*]], %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_J_SPLIT1:.*]]
+; CHECK:       [[FOR_J_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_J:.*]]
+; CHECK:       [[FOR_J]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[TMP0:%.*]], %[[FOR_J_SPLIT:.*]] ], [ 0, %[[FOR_J_PREHEADER]] ]
+; CHECK-NEXT:    [[PROD_I]] = phi i32 [ [[PROD_I_LCSSA:%.*]], %[[FOR_J_SPLIT]] ], [ 1, %[[FOR_J_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_I_HEADER_PREHEADER]]
+; CHECK:       [[FOR_J_SPLIT1]]:
+; CHECK-NEXT:    [[IDX:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[A]], i32 0, i32 [[J]], i32 [[I]]
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[IDX]], align 4
+; CHECK-NEXT:    [[PROD_J_NEXT]] = mul i32 [[PROD_J]], [[A]]
+; CHECK-NEXT:    [[J_INC:%.*]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], 2
+; CHECK-NEXT:    br label %[[FOR_I_LATCH]]
+; CHECK:       [[FOR_J_SPLIT]]:
+; CHECK-NEXT:    [[PROD_I_LCSSA]] = phi i32 [ [[PROD_J_NEXT]], %[[FOR_I_LATCH]] ]
+; CHECK-NEXT:    [[TMP0]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_J]], label %[[EXIT:.*]]
+; CHECK:       [[FOR_I_LATCH]]:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[I_INC]], 2
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[FOR_I_HEADER]], label %[[FOR_J_SPLIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %prod.j.next = mul nsw nuw i32 %prod.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}

>From 00bc619fb0ea7a4ba75d385e1ab52c45de4b538f Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 15 Jul 2025 07:41:40 +0000
Subject: [PATCH 3/5] Address FIXME comments

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index ba3d60980da6f..61da144bdd270 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -863,13 +863,13 @@ findInnerReductionPhi(Loop *L, Value *V,
           unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
           SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
 
-          // FIXME: Is this check necessary?
+          // Bail out when we fail to collect the reduction instruction chain.
           if (Ops.empty())
             return nullptr;
+
           for (Instruction *I : Ops) {
-            // FIXME: Is this check necessary?
-            if (I->getOpcode() != OpCode)
-              return nullptr;
+            assert(I->getOpcode() == OpCode &&
+                   "Expected the instruction to be the reduction operation");
 
             // If the instruction has nuw/nsw flags, we must drop them when the
             // transformation is actually performed.

>From ae7f398a0c1a2108455b77935b2bd6dc164ffa02 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 15 Jul 2025 09:23:54 +0000
Subject: [PATCH 4/5] Fix typo

---
 .../reductions-non-wrapped-operations.ll      | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
index 35ffd49666983..3ed69485bc8f2 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
@@ -2,7 +2,7 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa
 ; RUN: FileCheck -input-file=%t %s
 
-; Check that exchanging the loops is legal for the bitwise-or reduction.
+; Check that interchanging the loops is legal for the bitwise-or reduction.
 ;
 ; int b_or = 0;
 ; for (int i = 0; i < 2; i++)
@@ -43,7 +43,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the bitwise-and reduction.
+; Check that interchanging the loops is legal for the bitwise-and reduction.
 ;
 ; int b_and = -1;
 ; for (int i = 0; i < 2; i++)
@@ -84,7 +84,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the bitwise-xor reduction.
+; Check that interchanging the loops is legal for the bitwise-xor reduction.
 ;
 ; int b_xor = 0;
 ; for (int i = 0; i < 2; i++)
@@ -125,7 +125,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the signed-minimum reduction.
+; Check that interchanging the loops is legal for the signed-minimum reduction.
 ;
 ; int smin = init;
 ; for (int i = 0; i < 2; i++)
@@ -167,7 +167,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the signed-maximum reduction.
+; Check that interchanging the loops is legal for the signed-maximum reduction.
 ;
 ; int smax = init;
 ; for (int i = 0; i < 2; i++)
@@ -209,7 +209,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the unsigned-minimum reduction.
+; Check that interchanging the loops is legal for the unsigned-minimum reduction.
 ;
 ; unsigned umin = init;
 ; for (int i = 0; i < 2; i++)
@@ -251,7 +251,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the unsigned-maximum reduction.
+; Check that interchanging the loops is legal for the unsigned-maximum reduction.
 ;
 ; unsigned umax = 0;
 ; for (int i = 0; i < 2; i++)
@@ -293,7 +293,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the any-of reduction.
+; Check that interchanging the loops is legal for the any-of reduction.
 ;
 ; int any_of = 0;
 ; for (int i = 0; i < 2; i++)
@@ -375,7 +375,7 @@ exit:
   ret void
 }
 
-; Check that the interchange is legal if the floation-point addition is marked
+; Check that the interchange is legal if the floating-point addition is marked
 ; as reassoc.
 ;
 ; CHECK:      --- !Pass
@@ -452,7 +452,7 @@ exit:
   ret void
 }
 
-; Check that the interchange is legal if the floation-point multiplication is
+; Check that the interchange is legal if the floating-point multiplication is
 ; marked as reassoc.
 ;
 ; CHECK:      --- !Pass
@@ -531,7 +531,7 @@ exit:
   ret void
 }
 
-; Check that the interchange is legal if the floation-point fmuladd is marked
+; Check that the interchange is legal if the floating-point fmuladd is marked
 ; as reassoc.
 ;
 ; CHECK:      --- !Pass
@@ -569,8 +569,8 @@ exit:
   ret void
 }
 
-; Check that exchanging the loops is legal for the reassociative floating-point
-; minimum.
+; Check that interchanging the loops is legal for the reassociative
+; floating-point minimum.
 ;
 ; float fmin = init;
 ; for (int i = 0; i < 2; i++)
@@ -612,7 +612,7 @@ exit:
 }
 
 
-; Check that exchanging the loops is legal for the floation-point
+; Check that interchanging the loops is legal for the floating-point
 ; llvm.minimumnum.
 ;
 ; CHECK:      --- !Pass
@@ -648,8 +648,8 @@ exit:
   ret void
 }
 
-; Check that exchanging the loops is legal for the reassociative floating-point
-; maximum.
+; Check that interchanging the loops is legal for the reassociative
+; floating-point maximum.
 ;
 ; float fmax = init;
 ; for (int i = 0; i < 2; i++)
@@ -690,7 +690,7 @@ exit:
   ret void
 }
 
-; Check that exchanging the loops is legal for the floation-point
+; Check that interchanging the loops is legal for the floating-point
 ; llvm.maximumnum.
 
 ; CHECK:      --- !Pass

>From d8bbcc3179fc0e5dcf26e8eac9eda26e2b500434 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 15 Jul 2025 21:21:48 +0900
Subject: [PATCH 5/5] exchanging -> interchanging

---
 llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 61da144bdd270..09ebd2c913c0e 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -412,7 +412,7 @@ class LoopInterchangeLegality {
 
   /// Hold instructions that have nuw/nsw flags and involved in reductions,
   /// like integer addition/multiplication. Those flags must be dropped when
-  /// exchanging the loops.
+  /// interchanging the loops.
   SmallVector<Instruction *, 4> HasNoWrapReductions;
 };