[llvm] [AArch64] MI Scheduler: create more LDP/STP pairs (PR #77565)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 10 00:11:25 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

<details>
<summary>Changes</summary>

Target hook `canPairLdStOpc` is missing quite a few opcodes for which
LDPs/STPs can created. I was hoping that it would not be necessary to
add these missing opcodes here and that the attached motivating test
case would be handled by the LoadStoreOptimiser (especially after
 https://github.com/llvm/llvm-project/pull/71908), but it's not. The problem is that after register allocation
some things are a lot harder to do. Consider this for the motivating example

```
[1] renamable $q1 = LDURQi renamable $x9, -16 :: (load (s128) from %ir.r51, align 8, !tbaa !0)
[2] renamable $q2 = LDURQi renamable $x0, -16 :: (load (s128) from %ir.r53, align 8, !tbaa !4)
[3] renamable $q1 = nnan ninf nsz arcp contract afn reassoc nofpexcept FMLSv2f64 killed renamable $q1(tied-def 0), killed renamable $q2, renamable $q0, implicit $fpcr
[4] STURQi killed renamable $q1, renamable $x9, -16 :: (store (s128) into %ir.r51, align 1, !tbaa !0)
[5] renamable $q1 = LDRQui renamable $x9, 0 :: (load (s128) from %ir.r.G0001_609.0, align 8, !tbaa !0)
```
We can't combine the the load in line [5] into the load on [1]: regisister q1 is
used in between. And we can can't combine [1] into [5]: it is aliasing with
the STR on line [4].

That's why I thought that adding this opcode pair here to the MI scheduler was
a good compromise.

---
Full diff: https://github.com/llvm/llvm-project/pull/77565.diff


4 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+3) 
- (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+55-20) 
- (modified) llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll (+41) 
- (modified) llvm/test/CodeGen/AArch64/machine-combiner-copy.ll (+4-4) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1cfbf4737a6f72..42b7a6418032ad 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   switch (FirstOpc) {
   default:
     return false;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
   case AArch64::LDRWui:
   case AArch64::LDURWi:
     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index b435b3ce03e7ee..1551dc4435cea0 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1326,9 +1326,12 @@ static int alignTo(int Num, int PowOf2) {
 static bool mayAlias(MachineInstr &MIa,
                      SmallVectorImpl<MachineInstr *> &MemInsns,
                      AliasAnalysis *AA) {
-  for (MachineInstr *MIb : MemInsns)
-    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
+  for (MachineInstr *MIb : MemInsns) {
+    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
+      LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
       return true;
+    }
+  }
 
   return false;
 }
@@ -1525,7 +1528,7 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
   // * collect the registers used and required register classes for RegToRename.
   std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                            bool IsDef) {
-    LLVM_DEBUG(dbgs() << "Checking " << MI);
+    LLVM_DEBUG(dbgs() << "1.Checking " << MI);
     // Currently we do not try to rename across frame-setup instructions.
     if (MI.getFlag(MachineInstr::FrameSetup)) {
       LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
@@ -1612,7 +1615,7 @@ static bool canRenameUntilSecondLoad(
   bool Success = std::all_of(
       FirstLoad.getIterator(), SecondLoad.getIterator(),
       [&](MachineInstr &MI) {
-        LLVM_DEBUG(dbgs() << "Checking " << MI);
+        LLVM_DEBUG(dbgs() << "2.Checking " << MI);
         // Currently we do not try to rename across frame-setup instructions.
         if (MI.getFlag(MachineInstr::FrameSetup)) {
           LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
@@ -1757,9 +1760,11 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   // Remember any instructions that read/write memory between FirstMI and MI.
   SmallVector<MachineInstr *, 4> MemInsns;
 
+  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
   for (unsigned Count = 0; MBBI != E && Count < Limit;
        MBBI = next_nodbg(MBBI, E)) {
     MachineInstr &MI = *MBBI;
+    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
 
     UsedInBetween.accumulate(MI);
 
@@ -1859,6 +1864,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                               UsedRegUnits, TRI);
             MemInsns.push_back(&MI);
+            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
+                              << "keep looking.\n");
             continue;
           }
           // If the alignment requirements of the paired (scaled) instruction
@@ -1868,6 +1875,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
             LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                               UsedRegUnits, TRI);
             MemInsns.push_back(&MI);
+            LLVM_DEBUG(dbgs()
+                       << "Offset doesn't fit due to alignment requirements, "
+                       << "keep looking.\n");
             continue;
           }
         }
@@ -1884,14 +1894,25 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                 Reg, getLdStRegOp(MI).getReg());
 
-        // If the Rt of the second instruction was not modified or used between
-        // the two instructions and none of the instructions between the second
-        // and first alias with the second, we can combine the second into the
-        // first.
-        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
-            !(MI.mayLoad() && !SameLoadReg &&
-              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
-            !mayAlias(MI, MemInsns, AA)) {
+        // If the Rt of the second instruction (destination register of the
+        // load) was not modified or used between the two instructions and none
+        // of the instructions between the second and first alias with the
+        // second, we can combine the second into the first.
+        bool RtNotModified =
+            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
+        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
+                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));
+        bool DontAlias = !mayAlias(MI, MemInsns, AA);
+
+        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
+                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
+                          << (RtNotModified ? "true" : "false") << "\n"
+                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
+                          << (RtNotUsed ? "true" : "false") << "\n"
+                          << "Does not alias with other mem insns: "
+                          << (DontAlias ? "true" : "false") << "\n");
+
+        if (RtNotModified && RtNotUsed && DontAlias) {
           // For pairs loading into the same reg, try to find a renaming
           // opportunity to allow the renaming of Reg between FirstMI and MI
           // and combine MI into FirstMI; otherwise bail and keep looking.
@@ -1904,6 +1925,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
               LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                 UsedRegUnits, TRI);
               MemInsns.push_back(&MI);
+              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
+                                << "keep looking.\n");
               continue;
             }
             Flags.setRenameReg(*RenameReg);
@@ -1919,10 +1942,18 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // between the two instructions and none of the instructions between the
         // first and the second alias with the first, we can combine the first
         // into the second.
-        if (!(MayLoad &&
-              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
-            !mayAlias(FirstMI, MemInsns, AA)) {
-
+        RtNotModified = !(
+            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));
+        DontAlias = !mayAlias(FirstMI, MemInsns, AA);
+
+        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
+                          << "Reg '" << getLdStRegOp(FirstMI)
+                          << "' not modified: "
+                          << (RtNotModified ? "true" : "false") << "\n"
+                          << "Does not alias with other mem insns: "
+                          << (DontAlias ? "true" : "false") << "\n");
+
+        if (RtNotModified && DontAlias) {
           if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
             Flags.setMergeForward(true);
             Flags.clearRenameReg();
@@ -1938,8 +1969,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
             MBBIWithRenameReg = MBBI;
           }
         }
-        // Unable to combine these instructions due to interference in between.
-        // Keep looking.
+        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
+                          << "interference in between, keep looking.\n");
       }
     }
 
@@ -1948,16 +1979,20 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
 
     // If the instruction wasn't a matching load or store.  Stop searching if we
     // encounter a call instruction that might modify memory.
-    if (MI.isCall())
+    if (MI.isCall()) {
+      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
       return E;
+    }
 
     // Update modified / uses register units.
     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
 
     // Otherwise, if the base register is modified, we have no match, so
     // return early.
-    if (!ModifiedRegUnits.available(BaseReg))
+    if (!ModifiedRegUnits.available(BaseReg)) {
+      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
       return E;
+    }
 
     // Update list of instructions that read/write memory.
     if (MI.mayLoadOrStore())
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 4fa34e846b2061..83f86d1c3a7cbf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -113,3 +113,44 @@ define <2 x i64> @ldq_cluster(ptr %p) {
   %res  = mul nsw <2 x i64> %tmp2, %tmp3
   ret <2 x i64> %res
 }
+
+; Test LDURQi / LDRQui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURQi_LDRQui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(0) - SU(4)
+; CHECK: Cluster ld/st SU(1) - SU(5)
+;
+; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
+; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
+;
+define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
+entry:
+  br label %vector_body
+vector_body:
+  %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+  %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+  %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+  %r51 = getelementptr i8, ptr %phi1, i64 -16
+  %r52 = load <2 x double>, ptr %r51, align 8
+  %r53 = getelementptr i8, ptr %phi2, i64 -16
+  %r54 = load <2 x double>, ptr %r53, align 8
+  %r55 = fmul fast <2 x double> %r54, <double 3.0, double 4.0>
+  %r56 = fsub fast <2 x double> %r52, %r55
+  store <2 x double> %r56, ptr %r51, align 1
+  %r57 = load <2 x double>, ptr %phi1, align 8
+  %r58 = load <2 x double>, ptr %phi2, align 8
+  %r59 = fmul fast <2 x double> %r58,<double 3.0, double 4.0>
+  %r60 = fsub fast <2 x double> %r57, %r59
+  store <2 x double> %r60, ptr %phi1, align 1
+  %r61 = add i32 %phi3, 4
+  %r62 = getelementptr i8, ptr %phi2, i64 32
+  %r63 = getelementptr i8, ptr %phi1, i64 32
+  %r.not = icmp eq i32 %r61, 0
+  br i1 %r.not, label %exit, label %vector_body
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 15a484d11b0a1d..4c8e589391c3ac 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -20,13 +20,13 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    mov x12, x9
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q3, [x11, #-16]
+; CHECK-NEXT:    ldp q1, q4, [x10, #-16]
 ; CHECK-NEXT:    subs x12, x12, #16
-; CHECK-NEXT:    ldp q2, q4, [x10, #-16]
+; CHECK-NEXT:    ldp q2, q3, [x11, #-16]
 ; CHECK-NEXT:    add x11, x11, #32
-; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.h[0]
+; CHECK-NEXT:    fmla v1.8h, v2.8h, v0.h[0]
 ; CHECK-NEXT:    fmla v4.8h, v3.8h, v0.h[0]
-; CHECK-NEXT:    stp q2, q4, [x10, #-16]
+; CHECK-NEXT:    stp q1, q4, [x10, #-16]
 ; CHECK-NEXT:    add x10, x10, #32
 ; CHECK-NEXT:    b.ne .LBB0_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block

``````````

</details>


https://github.com/llvm/llvm-project/pull/77565


More information about the llvm-commits mailing list