[llvm] 9e516f5 - [MachinePipeliner] Remove isLoopCarriedDep and use DDG (#174394)

Fri Apr 3 03:36:40 PDT 2026

Author: Ryotaro Kasuga
Date: 2026-04-03T10:36:34Z
New Revision: 9e516f5c583a4f5beabe5591018306a2d1120235

URL: https://github.com/llvm/llvm-project/commit/9e516f5c583a4f5beabe5591018306a2d1120235
DIFF: https://github.com/llvm/llvm-project/commit/9e516f5c583a4f5beabe5591018306a2d1120235.diff

LOG: [MachinePipeliner] Remove isLoopCarriedDep and use DDG (#174394)

This patch completely removes `isLoopCarriedDep`, which was used
previously to identify loop-carried dependencies in the DAG. Now that we
have the DDG representation, this special handling is no longer
necessary. Simply replacing its usage with the DDG causes several tests
to fail, since cycle detection takes some of the validation-only edges
in the DDG into account. To address this, this patch introduces extra
edges in the DDG, which are used only for cycle detection and not for
other parts of the pass (e.g., scheduling). The extra edges are
determined to preserve the existing behavior of the pass as closely as
possible, which makes the predicates for adding them somewhat complex.

Split off from #135148, and the final patch in the series for #135148

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/MachinePipeliner.h
    llvm/lib/CodeGen/MachinePipeliner.cpp

Removed: 
    llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index cf7901ad697de..a755589fe901b 100644

--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -233,6 +233,14 @@ class SwingSchedulerDDG {
   struct SwingSchedulerDDGEdges {
     EdgesType Preds;
     EdgesType Succs;
+
+    /// This field is a subset of ValidationOnlyEdges. These edges are used only
+    /// by specific heuristics, mainly for cycle detection. Although they are
+    /// unnecessary in theory (i.e., ignoring them should still yield a valid
+    /// schedule), they are retained to preserve the existing behavior. Since we
+    /// only need which extra edges exist from a given SUnit, we only store the
+    /// destination SUnits.
+    SmallVector<SUnit *, 4> ExtraSuccs;
   };
 
   void initEdges(SUnit *SU);
@@ -263,6 +271,8 @@ class SwingSchedulerDDG {
 
   const EdgesType &getOutEdges(const SUnit *SU) const;
 
+  ArrayRef<SUnit *> getExtraOutEdges(const SUnit *SU) const;
+
   bool isValidSchedule(const SMSchedule &Schedule) const;
 };
 
@@ -358,7 +368,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
       NumPaths = 0;
     }
 
-    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
+    void createAdjacencyStructure(SwingSchedulerDDG *DDG);
     bool circuit(int V, int S, NodeSetType &NodeSets,
                  const SwingSchedulerDAG *DAG, bool HasBackedge = false);
     void unblock(int U);
@@ -415,8 +425,6 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
   }
 
-  bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const;
-
   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
 
   void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
@@ -527,13 +535,11 @@ class NodeSet {
     SUnit *FirstNode = Nodes[0];
     SUnit *LastNode = Nodes[Nodes.size() - 1];
 
-    for (auto &PI : DDG->getInEdges(LastNode)) {
+    for (SUnit *SU : DDG->getExtraOutEdges(LastNode)) {
       // If we have an order dep that is potentially loop carried then a
-      // back-edge exists between the last node and the first node that isn't
-      // modeled in the DAG. Handle it manually by adding 1 to the distance of
-      // the last node.
-      if (PI.getSrc() != FirstNode || !PI.isOrderDep() ||
-          !DAG->isLoopCarriedDep(PI))
+      // back-edge exists between the last node and the first node in extra
+      // edges. Handle it manually by adding 1 to the distance of the last node.
+      if (SU != FirstNode)
         continue;
       unsigned &First = SUnitToDistance[FirstNode];
       unsigned Last = SUnitToDistance[LastNode];

diff  --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 43bb19bf4f181..bfd4fa0c589af 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1969,13 +1969,13 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
 
 /// Create the adjacency structure of the nodes in the graph.
 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
-    SwingSchedulerDAG *DAG) {
+    SwingSchedulerDDG *DDG) {
   BitVector Added(SUnits.size());
   DenseMap<int, int> OutputDeps;
   for (int i = 0, e = SUnits.size(); i != e; ++i) {
     Added.reset();
     // Add any successor to the adjacency matrix and exclude duplicates.
-    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
+    for (auto &OE : DDG->getOutEdges(&SUnits[i])) {
       // Only create a back-edge on the first and last nodes of a dependence
       // chain. This records any chains and adds them later.
       if (OE.isOutputDep()) {
@@ -2007,22 +2007,17 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         Added.set(N);
       }
     }
-    // A chain edge between a store and a load is treated as a back-edge in the
-    // adjacency matrix.
-    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
-      SUnit *Src = IE.getSrc();
-      SUnit *Dst = IE.getDst();
-      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
-        continue;
-      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
-        int N = Src->NodeNum;
-        if (!Added.test(N)) {
-          AdjK[i].push_back(N);
-          Added.set(N);
-        }
+
+    // Also add any extra out edges to the adjacency matrix.
+    for (const SUnit *Dst : DDG->getExtraOutEdges(&SUnits[i])) {
+      int N = Dst->NodeNum;
+      if (!Added.test(N)) {
+        AdjK[i].push_back(N);
+        Added.set(N);
       }
     }
   }
+
   // Add back-edges in the adjacency matrix for the output dependences.
   for (auto &OD : OutputDeps)
     if (!Added.test(OD.second)) {
@@ -2092,7 +2087,7 @@ void SwingSchedulerDAG::Circuits::unblock(int U) {
 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
-  Cir.createAdjacencyStructure(this);
+  Cir.createAdjacencyStructure(&*DDG);
   for (int I = 0, E = SUnits.size(); I != E; ++I) {
     Cir.reset();
     Cir.circuit(I, I, NodeSets, this);
@@ -3235,40 +3230,6 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
   return true;
 }
 
-/// Return true for an order or output dependence that is loop carried
-/// potentially. A dependence is loop carried if the destination defines a value
-/// that may be used or defined by the source in a subsequent iteration.
-bool SwingSchedulerDAG::isLoopCarriedDep(
-    const SwingSchedulerDDGEdge &Edge) const {
-  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
-      Edge.getDst()->isBoundaryNode())
-    return false;
-
-  if (!SwpPruneLoopCarried)
-    return true;
-
-  if (Edge.isOutputDep())
-    return true;
-
-  MachineInstr *SI = Edge.getSrc()->getInstr();
-  MachineInstr *DI = Edge.getDst()->getInstr();
-  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
-
-  // Assume ordered loads and stores may have a loop carried dependence.
-  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
-      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
-      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
-    return true;
-
-  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
-    return false;
-
-  // The conservative assumption is that a dependence between memory operations
-  // may be loop carried. The following code checks when it can be proved that
-  // there is no loop carried dependence.
-  return mayOverlapInLaterIter(DI, SI);
-}
-
 void SwingSchedulerDAG::postProcessDAG() {
   for (auto &M : Mutations)
     M->apply(this);
@@ -4253,6 +4214,7 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                 const SwingSchedulerDDGEdge &Edge) {
   assert(!Edge.isValidationOnly() &&
          "Validation-only edges are not expected here.");
+
   auto &Edges = getEdges(SU);
   if (Edge.getSrc() == SU)
     Edges.Succs.push_back(Edge);
@@ -4296,6 +4258,32 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
                                    /*IsValidationOnly=*/true);
         Edge.setDistance(1);
         ValidationOnlyEdges.push_back(Edge);
+
+        // Store the edge as an extra edge if it meets the following conditions:
+        //
+        //  - The edge is a loop-carried order dependency.
+        //  - The edge is a back edge in terms of the original instruction
+        //    order.
+        //  - The destination instruction may load.
+        //  - The source instruction may store but does not load.
+        //
+        // These conditions are inherited from a previous implementation to
+        // preserve the existing behavior and avoid regressions.
+        bool UseAsExtraEdge = [&]() {
+          if (Edge.getDistance() == 0 || !Edge.isOrderDep())
+            return false;
+
+          SUnit *Src = Edge.getSrc();
+          SUnit *Dst = Edge.getDst();
+          if (Src->NodeNum < Dst->NodeNum)
+            return false;
+
+          MachineInstr *SrcMI = Src->getInstr();
+          MachineInstr *DstMI = Dst->getInstr();
+          return DstMI->mayLoad() && !SrcMI->mayLoad() && SrcMI->mayStore();
+        }();
+        if (UseAsExtraEdge)
+          getEdges(Edge.getSrc()).ExtraSuccs.push_back(Edge.getDst());
       }
     }
   }
@@ -4311,6 +4299,10 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
   return getEdges(SU).Succs;
 }
 
+ArrayRef<SUnit *> SwingSchedulerDDG::getExtraOutEdges(const SUnit *SU) const {
+  return getEdges(SU).ExtraSuccs;
+}
+
 /// Check if \p Schedule doesn't violate the validation-only dependencies.
 bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
   unsigned II = Schedule.getInitiationInterval();

diff  --git a/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir b/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
deleted file mode 100644
index 3a984f6488700..0000000000000
--- a/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
+++ /dev/null
@@ -1,335 +0,0 @@
-# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner -pipeliner-max-stages=50 -pipeliner-max-mii=50 -pipeliner-enable-copytophi=0 -pipeliner-ii-search-range=30 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
-# Test that each instruction must be scheduled between the early cycle and the late cycle. Previously there were cases where an instruction is scheduled outside of the valid range. See issue #93936 for details.
-
-# CHECK: {{^ *}}Try to schedule with 47
-# CHECK: {{^ *}}Inst (11)   %48:fpr128 = LDRQui %35:gpr64sp, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !0)
-# CHECK-EMPTY:
-# CHECK-NEXT: {{^ *}}es: ffffffe8 ls: ffffffe9
-# CHECK-NEXT: {{^ *}}Trying to insert node between -24 and -23 II: 47
-# CHECK-NEXT: {{^ *}}insert at cycle -24   %48:fpr128 = LDRQui %35:gpr64sp, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !0)
-
---- |
-  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-  
-  define dso_local void @f(ptr nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef readonly %c, ptr nocapture noundef readonly %d, ptr nocapture noundef readonly %e, float noundef %f, i32 noundef %N) local_unnamed_addr {
-  entry:
-    %cmp16 = icmp sgt i32 %N, 0
-    br i1 %cmp16, label %for.body.preheader, label %for.cond.cleanup
-  
-  for.body.preheader:                               ; preds = %entry
-    %wide.trip.count = zext nneg i32 %N to i64
-    %min.iters.check = icmp ult i32 %N, 8
-    br i1 %min.iters.check, label %for.body.preheader37, label %vector.memcheck
-  
-  vector.memcheck:                                  ; preds = %for.body.preheader
-    %0 = ptrtoint ptr %a to i64
-    %1 = ptrtoint ptr %b to i64
-    %2 = ptrtoint ptr %c to i64
-    %3 = ptrtoint ptr %d to i64
-    %4 = ptrtoint ptr %e to i64
-    %5 = sub i64 %0, %1
-    %diff.check = icmp ult i64 %5, 32
-    %6 = sub i64 %0, %2
-    %diff.check22 = icmp ult i64 %6, 32
-    %conflict.rdx = or i1 %diff.check, %diff.check22
-    %7 = sub i64 %0, %3
-    %diff.check24 = icmp ult i64 %7, 32
-    %conflict.rdx25 = or i1 %conflict.rdx, %diff.check24
-    %8 = sub i64 %0, %4
-    %diff.check27 = icmp ult i64 %8, 32
-    %conflict.rdx28 = or i1 %conflict.rdx25, %diff.check27
-    br i1 %conflict.rdx28, label %for.body.preheader37, label %vector.ph
-  
-  vector.ph:                                        ; preds = %vector.memcheck
-    %n.vec = and i64 %wide.trip.count, 2147483640
-    %broadcast.splatinsert = insertelement <4 x float> poison, float %f, i64 0
-    %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
-    %scevgep54 = getelementptr i8, ptr %b, i64 16
-    %scevgep58 = getelementptr i8, ptr %a, i64 16
-    %scevgep62 = getelementptr i8, ptr %c, i64 16
-    %scevgep66 = getelementptr i8, ptr %e, i64 16
-    %scevgep70 = getelementptr i8, ptr %d, i64 16
-    br label %vector.body
-  
-  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv71 = phi ptr [ %scevgep72, %vector.body ], [ %scevgep70, %vector.ph ]
-    %lsr.iv67 = phi ptr [ %scevgep68, %vector.body ], [ %scevgep66, %vector.ph ]
-    %lsr.iv63 = phi ptr [ %scevgep64, %vector.body ], [ %scevgep62, %vector.ph ]
-    %lsr.iv59 = phi ptr [ %scevgep60, %vector.body ], [ %scevgep58, %vector.ph ]
-    %lsr.iv55 = phi ptr [ %scevgep56, %vector.body ], [ %scevgep54, %vector.ph ]
-    %lsr.iv52 = phi i64 [ %lsr.iv.next53, %vector.body ], [ %n.vec, %vector.ph ]
-    %scevgep57 = getelementptr i8, ptr %lsr.iv55, i64 -16
-    %wide.load = load <4 x float>, ptr %scevgep57, align 4, !tbaa !6
-    %wide.load29 = load <4 x float>, ptr %lsr.iv55, align 4, !tbaa !6
-    %9 = fmul <4 x float> %wide.load, %broadcast.splat
-    %10 = fmul <4 x float> %wide.load29, %broadcast.splat
-    %scevgep65 = getelementptr i8, ptr %lsr.iv63, i64 -16
-    %wide.load30 = load <4 x float>, ptr %scevgep65, align 4, !tbaa !6
-    %wide.load31 = load <4 x float>, ptr %lsr.iv63, align 4, !tbaa !6
-    %scevgep73 = getelementptr i8, ptr %lsr.iv71, i64 -16
-    %wide.load32 = load <4 x float>, ptr %scevgep73, align 4, !tbaa !6
-    %wide.load33 = load <4 x float>, ptr %lsr.iv71, align 4, !tbaa !6
-    %11 = fsub <4 x float> %wide.load30, %wide.load32
-    %12 = fsub <4 x float> %wide.load31, %wide.load33
-    %13 = fmul <4 x float> %9, %11
-    %14 = fmul <4 x float> %10, %12
-    %scevgep69 = getelementptr i8, ptr %lsr.iv67, i64 -16
-    %wide.load34 = load <4 x float>, ptr %scevgep69, align 4, !tbaa !6
-    %wide.load35 = load <4 x float>, ptr %lsr.iv67, align 4, !tbaa !6
-    %15 = fdiv <4 x float> %13, %wide.load34
-    %16 = fdiv <4 x float> %14, %wide.load35
-    %scevgep61 = getelementptr i8, ptr %lsr.iv59, i64 -16
-    store <4 x float> %15, ptr %scevgep61, align 4, !tbaa !6
-    store <4 x float> %16, ptr %lsr.iv59, align 4, !tbaa !6
-    %lsr.iv.next53 = add nsw i64 %lsr.iv52, -8
-    %scevgep56 = getelementptr i8, ptr %lsr.iv55, i64 32
-    %scevgep60 = getelementptr i8, ptr %lsr.iv59, i64 32
-    %scevgep64 = getelementptr i8, ptr %lsr.iv63, i64 32
-    %scevgep68 = getelementptr i8, ptr %lsr.iv67, i64 32
-    %scevgep72 = getelementptr i8, ptr %lsr.iv71, i64 32
-    %17 = icmp eq i64 %lsr.iv.next53, 0
-    br i1 %17, label %middle.block, label %vector.body, !llvm.loop !10
-  
-  middle.block:                                     ; preds = %vector.body
-    %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
-    br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader37
-  
-  for.body.preheader37:                             ; preds = %vector.memcheck, %for.body.preheader, %middle.block
-    %indvars.iv.ph = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ], [ 0, %vector.memcheck ]
-    %18 = shl nuw nsw i64 %indvars.iv.ph, 2
-    %scevgep = getelementptr i8, ptr %a, i64 %18
-    %scevgep39 = getelementptr i8, ptr %e, i64 %18
-    %scevgep42 = getelementptr i8, ptr %d, i64 %18
-    %scevgep45 = getelementptr i8, ptr %c, i64 %18
-    %scevgep48 = getelementptr i8, ptr %b, i64 %18
-    %19 = sub i64 %wide.trip.count, %indvars.iv.ph
-    br label %for.body
-  
-  for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
-    ret void
-  
-  for.body:                                         ; preds = %for.body.preheader37, %for.body
-    %lsr.iv51 = phi i64 [ %19, %for.body.preheader37 ], [ %lsr.iv.next, %for.body ]
-    %lsr.iv49 = phi ptr [ %scevgep48, %for.body.preheader37 ], [ %scevgep50, %for.body ]
-    %lsr.iv46 = phi ptr [ %scevgep45, %for.body.preheader37 ], [ %scevgep47, %for.body ]
-    %lsr.iv43 = phi ptr [ %scevgep42, %for.body.preheader37 ], [ %scevgep44, %for.body ]
-    %lsr.iv40 = phi ptr [ %scevgep39, %for.body.preheader37 ], [ %scevgep41, %for.body ]
-    %lsr.iv = phi ptr [ %scevgep, %for.body.preheader37 ], [ %scevgep38, %for.body ]
-    %20 = load float, ptr %lsr.iv49, align 4, !tbaa !6
-    %mul = fmul float %20, %f
-    %21 = load float, ptr %lsr.iv46, align 4, !tbaa !6
-    %22 = load float, ptr %lsr.iv43, align 4, !tbaa !6
-    %sub = fsub float %21, %22
-    %mul5 = fmul float %mul, %sub
-    %23 = load float, ptr %lsr.iv40, align 4, !tbaa !6
-    %div = fdiv float %mul5, %23
-    store float %div, ptr %lsr.iv, align 4, !tbaa !6
-    %scevgep38 = getelementptr i8, ptr %lsr.iv, i64 4
-    %scevgep41 = getelementptr i8, ptr %lsr.iv40, i64 4
-    %scevgep44 = getelementptr i8, ptr %lsr.iv43, i64 4
-    %scevgep47 = getelementptr i8, ptr %lsr.iv46, i64 4
-    %scevgep50 = getelementptr i8, ptr %lsr.iv49, i64 4
-    %lsr.iv.next = add i64 %lsr.iv51, -1
-    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
-    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-  }
-  
-  !6 = !{!7, !7, i64 0}
-  !7 = !{!"float", !8, i64 0}
-  !8 = !{!"omnipotent char", !9, i64 0}
-  !9 = !{!"Simple C/C++ TBAA"}
-  !10 = distinct !{!10, !11, !12, !13}
-  !11 = !{!"llvm.loop.mustprogress"}
-  !12 = !{!"llvm.loop.isvectorized", i32 1}
-  !13 = !{!"llvm.loop.unroll.runtime.disable"}
-  !14 = distinct !{!14, !11, !12}
-
-...
----
-name:            f
-tracksRegLiveness: true
-liveins:
-  - { reg: '$x0', virtual-reg: '%39' }
-  - { reg: '$x1', virtual-reg: '%40' }
-  - { reg: '$x2', virtual-reg: '%41' }
-  - { reg: '$x3', virtual-reg: '%42' }
-  - { reg: '$x4', virtual-reg: '%43' }
-  - { reg: '$s0', virtual-reg: '%44' }
-  - { reg: '$w5', virtual-reg: '%45' }
-body:             |
-  bb.0.entry:
-    successors: %bb.1, %bb.7
-    liveins: $x0, $x1, $x2, $x3, $x4, $s0, $w5
-  
-    %45:gpr32common = COPY $w5
-    %44:fpr32 = COPY $s0
-    %43:gpr64common = COPY $x4
-    %42:gpr64common = COPY $x3
-    %41:gpr64common = COPY $x2
-    %40:gpr64common = COPY $x1
-    %39:gpr64common = COPY $x0
-    dead $wzr = SUBSWri %45, 1, 0, implicit-def $nzcv
-    Bcc 11, %bb.7, implicit $nzcv
-    B %bb.1
-  
-  bb.1.for.body.preheader:
-    successors: %bb.12, %bb.2
-  
-    %48:gpr32 = ORRWrs $wzr, %45, 0
-    %0:gpr64 = SUBREG_TO_REG killed %48, %subreg.sub_32
-    dead $wzr = SUBSWri %45, 8, 0, implicit-def $nzcv
-    Bcc 2, %bb.2, implicit $nzcv
-  
-  bb.12:
-    %49:gpr64all = COPY $xzr
-    %47:gpr64all = COPY %49
-    B %bb.6
-  
-  bb.2.vector.memcheck:
-    successors: %bb.6, %bb.11
-  
-    %55:gpr64common = SUBXrr %39, %40
-    %59:gpr64all = COPY $xzr
-    %51:gpr64all = COPY %59
-    dead $xzr = SUBSXri killed %55, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.11
-  
-  bb.11.vector.memcheck:
-    successors: %bb.6, %bb.10
-  
-    %56:gpr64common = SUBXrr %39, %41
-    dead $xzr = SUBSXri %56, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.10
-  
-  bb.10.vector.memcheck:
-    successors: %bb.6, %bb.9
-  
-    %57:gpr64common = SUBXrr %39, %42
-    dead $xzr = SUBSXri %57, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.9
-  
-  bb.9.vector.memcheck:
-    successors: %bb.6, %bb.3
-  
-    %58:gpr64common = SUBXrr %39, %43
-    dead $xzr = SUBSXri %58, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.3
-  
-  bb.3.vector.ph:
-    %64:gpr64common = ANDXri %0, 8027
-    %1:gpr64 = COPY %64
-    %66:fpr128 = IMPLICIT_DEF
-    %65:fpr128 = INSERT_SUBREG %66, %44, %subreg.ssub
-    %67:gpr64sp = ADDXri %40, 16, 0
-    %3:gpr64all = COPY %67
-    %68:gpr64sp = ADDXri %39, 16, 0
-    %4:gpr64all = COPY %68
-    %69:gpr64sp = ADDXri %41, 16, 0
-    %5:gpr64all = COPY %69
-    %70:gpr64sp = ADDXri %43, 16, 0
-    %6:gpr64all = COPY %70
-    %71:gpr64sp = ADDXri %42, 16, 0
-    %7:gpr64all = COPY %71
-  
-  bb.4.vector.body:
-    successors: %bb.5, %bb.4
-  
-    %8:gpr64sp = PHI %7, %bb.3, %19, %bb.4
-    %9:gpr64sp = PHI %6, %bb.3, %18, %bb.4
-    %10:gpr64sp = PHI %5, %bb.3, %17, %bb.4
-    %11:gpr64sp = PHI %4, %bb.3, %16, %bb.4
-    %12:gpr64sp = PHI %3, %bb.3, %15, %bb.4
-    %13:gpr64sp = PHI %1, %bb.3, %14, %bb.4
-    %72:fpr128 = LDURQi %12, -16 :: (load (s128) from %ir.scevgep57, align 4, !tbaa !6)
-    %73:fpr128 = LDRQui %12, 0 :: (load (s128) from %ir.lsr.iv55, align 4, !tbaa !6)
-    %74:fpr128 = nofpexcept FMULv4i32_indexed killed %72, %65, 0, implicit $fpcr
-    %75:fpr128 = nofpexcept FMULv4i32_indexed killed %73, %65, 0, implicit $fpcr
-    %76:fpr128 = LDURQi %10, -16 :: (load (s128) from %ir.scevgep65, align 4, !tbaa !6)
-    %77:fpr128 = LDRQui %10, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !6)
-    %78:fpr128 = LDURQi %8, -16 :: (load (s128) from %ir.scevgep73, align 4, !tbaa !6)
-    %79:fpr128 = LDRQui %8, 0 :: (load (s128) from %ir.lsr.iv71, align 4, !tbaa !6)
-    %80:fpr128 = nofpexcept FSUBv4f32 killed %76, killed %78, implicit $fpcr
-    %81:fpr128 = nofpexcept FSUBv4f32 killed %77, killed %79, implicit $fpcr
-    %82:fpr128 = nofpexcept FMULv4f32 killed %74, killed %80, implicit $fpcr
-    %83:fpr128 = nofpexcept FMULv4f32 killed %75, killed %81, implicit $fpcr
-    %84:fpr128 = LDURQi %9, -16 :: (load (s128) from %ir.scevgep69, align 4, !tbaa !6)
-    %85:fpr128 = LDRQui %9, 0 :: (load (s128) from %ir.lsr.iv67, align 4, !tbaa !6)
-    %86:fpr128 = nofpexcept FDIVv4f32 killed %82, killed %84, implicit $fpcr
-    %87:fpr128 = nofpexcept FDIVv4f32 killed %83, killed %85, implicit $fpcr
-    STURQi killed %86, %11, -16 :: (store (s128) into %ir.scevgep61, align 4, !tbaa !6)
-    STRQui killed %87, %11, 0 :: (store (s128) into %ir.lsr.iv59, align 4, !tbaa !6)
-    %88:gpr64 = nsw SUBSXri %13, 8, 0, implicit-def $nzcv
-    %14:gpr64all = COPY %88
-    %89:gpr64sp = ADDXri %12, 32, 0
-    %15:gpr64all = COPY %89
-    %90:gpr64sp = ADDXri %11, 32, 0
-    %16:gpr64all = COPY %90
-    %91:gpr64sp = ADDXri %10, 32, 0
-    %17:gpr64all = COPY %91
-    %92:gpr64sp = ADDXri %9, 32, 0
-    %18:gpr64all = COPY %92
-    %93:gpr64sp = ADDXri %8, 32, 0
-    %19:gpr64all = COPY %93
-    Bcc 1, %bb.4, implicit $nzcv
-    B %bb.5
-  
-  bb.5.middle.block:
-    dead $xzr = SUBSXrr %64, %0, implicit-def $nzcv
-    Bcc 0, %bb.7, implicit $nzcv
-    B %bb.6
-  
-  bb.6.for.body.preheader37:
-    %20:gpr64 = PHI %47, %bb.12, %51, %bb.2, %51, %bb.11, %51, %bb.10, %51, %bb.9, %1, %bb.5
-    %95:gpr64 = nuw nsw UBFMXri %20, 62, 61
-    %96:gpr64 = ADDXrr %39, %95
-    %21:gpr64all = COPY %96
-    %97:gpr64 = ADDXrr %43, %95
-    %22:gpr64all = COPY %97
-    %98:gpr64 = ADDXrr %42, %95
-    %23:gpr64all = COPY %98
-    %99:gpr64 = ADDXrr %41, %95
-    %24:gpr64all = COPY %99
-    %100:gpr64 = ADDXrr %40, %95
-    %25:gpr64all = COPY %100
-    %101:gpr64 = SUBXrr %0, %20
-    %26:gpr64all = COPY %101
-    B %bb.8
-  
-  bb.7.for.cond.cleanup:
-    RET_ReallyLR
-  
-  bb.8.for.body:
-    successors: %bb.7, %bb.8
-  
-    %27:gpr64sp = PHI %26, %bb.6, %38, %bb.8
-    %28:gpr64sp = PHI %25, %bb.6, %37, %bb.8
-    %29:gpr64sp = PHI %24, %bb.6, %36, %bb.8
-    %30:gpr64sp = PHI %23, %bb.6, %35, %bb.8
-    %31:gpr64sp = PHI %22, %bb.6, %34, %bb.8
-    %32:gpr64sp = PHI %21, %bb.6, %33, %bb.8
-    early-clobber %102:gpr64sp, %103:fpr32 = LDRSpost %28, 4 :: (load (s32) from %ir.lsr.iv49, !tbaa !6)
-    %104:fpr32 = nofpexcept FMULSrr killed %103, %44, implicit $fpcr
-    early-clobber %105:gpr64sp, %106:fpr32 = LDRSpost %29, 4 :: (load (s32) from %ir.lsr.iv46, !tbaa !6)
-    early-clobber %107:gpr64sp, %108:fpr32 = LDRSpost %30, 4 :: (load (s32) from %ir.lsr.iv43, !tbaa !6)
-    %109:fpr32 = nofpexcept FSUBSrr killed %106, killed %108, implicit $fpcr
-    %110:fpr32 = nofpexcept FMULSrr killed %104, killed %109, implicit $fpcr
-    early-clobber %111:gpr64sp, %112:fpr32 = LDRSpost %31, 4 :: (load (s32) from %ir.lsr.iv40, !tbaa !6)
-    %113:fpr32 = nofpexcept FDIVSrr killed %110, killed %112, implicit $fpcr
-    early-clobber %114:gpr64sp = STRSpost killed %113, %32, 4 :: (store (s32) into %ir.lsr.iv, !tbaa !6)
-    %33:gpr64all = COPY %114
-    %34:gpr64all = COPY %111
-    %35:gpr64all = COPY %107
-    %36:gpr64all = COPY %105
-    %37:gpr64all = COPY %102
-    %115:gpr64 = SUBSXri %27, 1, 0, implicit-def $nzcv
-    %38:gpr64all = COPY %115
-    Bcc 0, %bb.7, implicit $nzcv
-    B %bb.8
-
-...