[llvm] 2b1c6df - [Hexagon] Performance regression with b2b

Krzysztof Parzyszek via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 4 14:13:55 PST 2022


Author: Harsha Jagasia
Date: 2022-01-04T14:09:47-08:00
New Revision: 2b1c6df5a60ab7846974676586b0e3801e919772

URL: https://github.com/llvm/llvm-project/commit/2b1c6df5a60ab7846974676586b0e3801e919772
DIFF: https://github.com/llvm/llvm-project/commit/2b1c6df5a60ab7846974676586b0e3801e919772.diff

LOG: [Hexagon] Performance regression with b2b

For the code below:
        {
                r7 = addasl(r3,r0,#2)
                r8 = addasl(r3,r2,#2)
                r5 = memw(r3+r0<<#2)
                r6 = memw(r3+r2<<#2)
        }
        {
                p1 = cmp.gtu(r6,r5)
                if (p1.new) memw(r8+#0) = r5
                if (p1.new) memw(r7+#0) = r6
        }
        {
                r0 = mux(p1,r2,r4)
        }

In the packetizer, a new packet is created for the cmp instruction since
there aren't enough resources in the previous packet. The packetizer also
determines that the cmp stalls by 2 cycles because it depends on the prior
load of r5. In the current implementation, the predicated store is evaluated
for whether it can go in the same packet as the compare; since the compare
already stalls, the stall of the predicated store does not matter, and it is
placed in the same packet as the cmp. However, the predicated store stalls
for more cycles because of its dependence on the addasl instruction, and to
avoid that stall it should be placed in a new packet.

Improve the packetizer to check whether an instruction being added to a
packet will stall longer than the instructions already in the packet, and if
so, start a new packet.
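
The following is a minimal, self-contained C++ sketch of the decision rule.
The PacketState struct, the helper names, and the 3-cycle store latency are
illustrative assumptions for this example; only the comparison of a
candidate's stall against the packet's worst stall (PacketStallCycles) comes
from the patch below.

    #include <algorithm>
    #include <cstdio>

    // Per-packet bookkeeping, mirroring PacketStalls/PacketStallCycles in
    // HexagonPacketizerList.
    struct PacketState {
      bool Stalls = false;      // packet already contains a stalling insn
      unsigned StallCycles = 0; // worst stall (cycles) among packet members
    };

    // Mirrors the new producesStall(): a candidate only counts as stalling
    // if it would stall longer than anything already in the current packet.
    bool producesStall(const PacketState &P, unsigned CandidateStall) {
      if (CandidateStall == 0)
        return false;
      if (P.Stalls)
        return CandidateStall > P.StallCycles;
      return true;
    }

    // Mirrors the bookkeeping added to addToPacket().
    void addToPacket(PacketState &P, unsigned CandidateStall) {
      P.Stalls |= producesStall(P, CandidateStall);
      P.StallCycles = std::max(P.StallCycles, CandidateStall);
    }

    int main() {
      PacketState P;
      // The cmp stalls 2 cycles on the load of r5.
      std::printf("cmp stalls: %d\n", producesStall(P, 2));        // 1
      addToPacket(P, 2);
      // Assume the predicated store would stall 3 cycles on the addasl
      // result: it exceeds the packet's 2-cycle stall, so it is reported
      // as stalling and the packetizer can end the packet here.
      std::printf("store stalls: %d\n", producesStall(P, 3));      // 1
      // A store stalling only 2 cycles is hidden by the existing stall.
      std::printf("equal-stall store: %d\n", producesStall(P, 2)); // 0
      return 0;
    }

Before this change, the early "if (PacketStalls) return false;" exit in
producesStall() meant that once a packet stalled, every later stall was
ignored regardless of its length.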

Added: 
    llvm/test/CodeGen/Hexagon/nbench1.ll

Modified: 
    llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
    llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 85ec0cdcd8f0..0f736a189245 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -1696,9 +1696,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
   MachineBasicBlock::iterator MII = MI.getIterator();
   MachineBasicBlock *MBB = MI.getParent();
 
-  if (CurrentPacketMIs.empty())
+  if (CurrentPacketMIs.empty()) {
     PacketStalls = false;
+    PacketStallCycles = 0;
+  }
   PacketStalls |= producesStall(MI);
+  PacketStallCycles = std::max(PacketStallCycles, calcStall(MI));
 
   if (MI.isImplicitDef()) {
     // Add to the packet to allow subsequent instructions to be checked
@@ -1878,12 +1881,7 @@ bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store(
 }
 
 // V60 forward scheduling.
-bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
-  // If the packet already stalls, then ignore the stall from a subsequent
-  // instruction in the same packet.
-  if (PacketStalls)
-    return false;
-
+unsigned int HexagonPacketizerList::calcStall(const MachineInstr &I) {
   // Check whether the previous packet is in a different loop. If this is the
   // case, there is little point in trying to avoid a stall because that would
   // favor the rare case (loop entry) over the common case (loop iteration).
@@ -1895,10 +1893,12 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     auto *OldBB = OldPacketMIs.front()->getParent();
     auto *ThisBB = I.getParent();
     if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
-      return false;
+      return 0;
   }
 
   SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
+  if (!SUI)
+    return 0;
 
   // If the latency is 0 and there is a data dependence between this
   // instruction and any instruction in the current packet, we disregard any
@@ -1927,7 +1927,7 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
       if (Pred.getSUnit() == SUJ)
         if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
             HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
-          return false;
+          return 0;
   }
 
   // Check if the latency is greater than one between this instruction and any
@@ -1936,10 +1936,20 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
     SUnit *SUJ = MIToSUnit[J];
     for (auto &Pred : SUI->Preds)
       if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
-        return true;
+        return Pred.getLatency();
   }
 
-  return false;
+  return 0;
+}
+
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+  unsigned int Latency = calcStall(I);
+  if (Latency == 0)
+    return false;
+  // Ignore stall unless it stalls more than previous instruction in packet
+  if (PacketStalls)
+    return Latency > PacketStallCycles;
+  return true;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 27a47220570a..5d1b6d6faa12 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -56,6 +56,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   // Set to true if the packet contains an instruction that stalls with an
   // instruction from the previous packet.
   bool PacketStalls = false;
+  // Set to the number of cycles of stall a given instruction will incur
+  // because of dependence on instruction in previous packet.
+  unsigned int PacketStallCycles = 0;
 
   // Set to true if the packet has a duplex pair of sub-instructions.
   bool PacketHasDuplex = false;
@@ -157,6 +160,7 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
   bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI);
+  unsigned int calcStall(const MachineInstr &MI);
 };
 
 } // end namespace llvm

diff --git a/llvm/test/CodeGen/Hexagon/nbench1.ll b/llvm/test/CodeGen/Hexagon/nbench1.ll
new file mode 100644
index 000000000000..8300a9ab89ca
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/nbench1.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
+
+; if instruction being considered for addition to packet has higher latency,
+; end existing packet and start a new one.
+
+; CHECK: .LBB0_4:
+; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NEXT: }
+
+@array = external dso_local local_unnamed_addr global i32*, align 4
+
+; Function Attrs: nofree norecurse nounwind
+define dso_local void @NumSift(i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %add36 = shl i32 %i, 1
+  %cmp.not37 = icmp ugt i32 %add36, %j
+  br i1 %cmp.not37, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  %0 = load i32*, i32** @array, align 4
+  %add16 = add i32 %j, 1
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %if.end17
+  %add39 = phi i32 [ %add36, %while.body.lr.ph ], [ %add, %if.end17 ]
+  %i.addr.038 = phi i32 [ %i, %while.body.lr.ph ], [ %i.addr.1, %if.end17 ]
+  %cmp2 = icmp ult i32 %add39, %j
+  br i1 %cmp2, label %if.then, label %if.end7
+
+if.then:                                          ; preds = %while.body
+  %arrayidx = getelementptr inbounds i32, i32* %0, i32 %add39
+  %1 = load i32, i32* %arrayidx, align 4
+  %add3 = or i32 %add39, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i32 %add3
+  %2 = load i32, i32* %arrayidx4, align 4
+  %cmp5 = icmp ult i32 %1, %2
+  %spec.select = select i1 %cmp5, i32 %add3, i32 %add39
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.then, %while.body
+  %k.0 = phi i32 [ %add39, %while.body ], [ %spec.select, %if.then ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %0, i32 %i.addr.038
+  %3 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %0, i32 %k.0
+  %4 = load i32, i32* %arrayidx9, align 4
+  %cmp10 = icmp ult i32 %3, %4
+  br i1 %cmp10, label %if.then11, label %if.end17
+
+if.then11:                                        ; preds = %if.end7
+  store i32 %3, i32* %arrayidx9, align 4
+  store i32 %4, i32* %arrayidx8, align 4
+  br label %if.end17
+
+if.end17:                                         ; preds = %if.end7, %if.then11
+  %i.addr.1 = phi i32 [ %k.0, %if.then11 ], [ %add16, %if.end7 ]
+  %add = shl i32 %i.addr.1, 1
+  %cmp.not = icmp ugt i32 %add, %j
+  br i1 %cmp.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %if.end17, %entry
+  ret void
+}
+
+attributes #0 = { "target-cpu"="hexagonv65" }


        


More information about the llvm-commits mailing list