[llvm] ce76093 - [ARM] Expand predecessor search to multiple blocks when reverting WhileLoopStarts

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri May 14 07:09:37 PDT 2021


Author: David Green
Date: 2021-05-14T15:08:14+01:00
New Revision: ce76093c3c860599ceb3d588fb80700cd69f1b46

URL: https://github.com/llvm/llvm-project/commit/ce76093c3c860599ceb3d588fb80700cd69f1b46
DIFF: https://github.com/llvm/llvm-project/commit/ce76093c3c860599ceb3d588fb80700cd69f1b46.diff

LOG: [ARM] Expand predecessor search to multiple blocks when reverting WhileLoopStarts

We were previously searching only a single preheader block for call
instructions when reverting WhileLoopStarts to DoLoopStarts. This
extends the search to multiple blocks, which can come up when, for
example, a loop is expanded from a memcpy. It also expands the set of
instructions checked from just calls to other LoopStarts as well, to
catch other low overhead loops in the preheader.

Differential Revision: https://reviews.llvm.org/D102269

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir

Modified: 
    llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 3fd404e6d55b3..3ae8187720bc7 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -76,6 +76,8 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
   bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
   bool ConvertVPSEL(MachineBasicBlock &MBB);
   bool HintDoLoopStartReg(MachineBasicBlock &MBB);
+  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
+                                            MachineInstr *LoopStart);
 };
 
 char MVETPAndVPTOptimisations::ID = 0;
@@ -253,6 +255,53 @@ bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
   return true;
 }
 
+// Return true if this instruction is invalid in a low overhead loop, usually
+// because it clobbers LR.
+static bool IsInvalidTPInstruction(MachineInstr &MI) {
+  return MI.isCall() || isLoopStart(MI);
+}
+
+// Starting from PreHeader, search backwards through predecessors for invalid
+// instructions until the LoopStart block is reached. If one is found, the loop
+// start is reverted from a WhileLoopStart to a DoLoopStart on the same loop,
+// and the new DLS LoopStart is returned; otherwise the original is returned.
+MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
+    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
+  SmallVector<MachineBasicBlock *> Worklist;
+  SmallPtrSet<MachineBasicBlock *, 4> Visited;
+  Worklist.push_back(PreHeader);
+  Visited.insert(LoopStart->getParent());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+    if (Visited.count(MBB))
+      continue;
+
+    for (MachineInstr &MI : *MBB) {
+      if (!IsInvalidTPInstruction(MI))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);
+
+      // Create a t2DoLoopStart at the end of the preheader.
+      MachineInstrBuilder MIB =
+          BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
+                  LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
+      MIB.add(LoopStart->getOperand(0));
+      MIB.add(LoopStart->getOperand(1));
+
+      // Revert the t2WhileLoopStartLR to a CMP and Br.
+      RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
+      return MIB;
+    }
+
+    Visited.insert(MBB);
+    for (auto *Pred : MBB->predecessors())
+      Worklist.push_back(Pred);
+  }
+  return LoopStart;
+}
+
 // This function converts loops with t2LoopDec and t2LoopEnd instructions into
 // a single t2LoopEndDec instruction. To do that it needs to make sure that LR
 // will be valid to be used for the low overhead loop, which means nothing else
@@ -275,29 +324,13 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
   // and if so revert it now before we get any further. While loops also need to
   // check the preheaders, but can be reverted to a DLS loop if needed.
   auto *PreHeader = ML->getLoopPreheader();
-  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader &&
-      LoopStart->getParent() != PreHeader) {
-    for (MachineInstr &MI : *PreHeader) {
-      if (MI.isCall()) {
-        // Create a t2DoLoopStart at the end of the preheader.
-        MachineInstrBuilder MIB =
-            BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
-                    LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
-        MIB.add(LoopStart->getOperand(0));
-        MIB.add(LoopStart->getOperand(1));
-
-        // Revert the t2WhileLoopStartLR to a CMP and Br.
-        RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
-        LoopStart = MIB;
-        break;
-      }
-    }
-  }
+  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
+    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);
 
   for (MachineBasicBlock *MBB : ML->blocks()) {
     for (MachineInstr &MI : *MBB) {
-      if (MI.isCall()) {
-        LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
+      if (IsInvalidTPInstruction(MI)) {
+        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
         if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
           RevertDoLoopStart(LoopStart, TII);
         else

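The search itself is a standard backwards worklist traversal over block
predecessors. A minimal, self-contained sketch of the same pattern (the
Block type and isInvalid predicate here are hypothetical stand-ins for
MachineBasicBlock and IsInvalidTPInstruction, not LLVM API):

    #include <functional>
    #include <unordered_set>
    #include <vector>

    struct Block {
      std::vector<Block *> Preds; // predecessor blocks
      std::vector<int> Insts;     // instructions, as opaque ids
    };

    // Walk backwards from PreHeader through all predecessors, stopping at
    // the block containing the loop start, and report whether any visited
    // block holds an instruction the predicate rejects (e.g. an LR clobber).
    bool searchPredecessors(Block *PreHeader, Block *LoopStartBB,
                            const std::function<bool(int)> &isInvalid) {
      std::vector<Block *> Worklist{PreHeader};
      std::unordered_set<Block *> Visited{LoopStartBB}; // never walk past it

      while (!Worklist.empty()) {
        Block *B = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(B).second)
          continue; // already visited this block
        for (int I : B->Insts)
          if (isInvalid(I))
            return true; // found an invalid instruction
        for (Block *P : B->Preds)
          Worklist.push_back(P); // keep walking backwards
      }
      return false; // all reachable predecessors are clean
    }

In the pass itself a hit triggers the same reversion as before: a
t2DoLoopStart is built at the end of the preheader and the
t2WhileLoopStartLR is rewritten to a compare and branch.
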
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 3469dba948d33..8c8f67844257b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -276,6 +276,62 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
+; CHECK-LABEL: test_memset_preheader:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cbz r2, .LBB6_5
+; CHECK-NEXT:  @ %bb.1: @ %prehead
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    wlstp.8 lr, r3, .LBB6_3
+; CHECK-NEXT:  .LBB6_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r12], #16
+; CHECK-NEXT:    letp lr, .LBB6_2
+; CHECK-NEXT:  .LBB6_3: @ %prehead
+; CHECK-NEXT:    dls lr, r2
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:  .LBB6_4: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrb r3, [r12], #1
+; CHECK-NEXT:    strb r3, [r1], #1
+; CHECK-NEXT:    le lr, .LBB6_4
+; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB6_7
+; CHECK-NEXT:  .LBB6_6: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB6_6
+; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp6 = icmp ne i32 %n, 0
+  br i1 %cmp6, label %prehead, label %for.cond.cleanup
+
+prehead:
+  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
+  %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
+  %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
+  %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
+  %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
+  %l = load i8, i8* %x.addr.08
+  store i8 %l, i8* %y.addr.07
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  ret void
+}
+
+
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
 declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
new file mode 100644
index 0000000000000..234b112050d47
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wls-search-pred.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve %s -run-pass=arm-mve-vpt-opts --verify-machineinstrs -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-none-unknown-eabihf"
+
+  define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
+  entry:
+    %cmp6 = icmp ne i32 %n, 0
+    %0 = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %n)
+    %1 = extractvalue { i32, i1 } %0, 1
+    %2 = extractvalue { i32, i1 } %0, 0
+    br i1 %1, label %prehead, label %for.cond.cleanup
+
+  prehead:                                          ; preds = %entry
+    call void @llvm.memset.p0i8.i32(i8* align 1 %x, i8 0, i32 %n, i1 false)
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body, %prehead
+    %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
+    %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
+    %3 = phi i32 [ %2, %prehead ], [ %4, %for.body ]
+    %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
+    %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
+    %l = load i8, i8* %x.addr.08, align 1
+    store i8 %l, i8* %y.addr.07, align 1
+    %4 = call i32 @llvm.loop.decrement.reg.i32(i32 %3, i32 1)
+    %5 = icmp ne i32 %4, 0
+    br i1 %5, label %for.body, label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    ret void
+  }
+
+  declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)
+  declare { i32, i1 } @llvm.test.start.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
+
+...
+---
+name:            test_memset_preheader
+tracksRegLiveness: true
+liveins:
+  - { reg: '$r0', virtual-reg: '%7' }
+  - { reg: '$r1', virtual-reg: '%8' }
+  - { reg: '$r2', virtual-reg: '%9' }
+body:             |
+  ; CHECK-LABEL: name: test_memset_preheader
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   [[COPY:%[0-9]+]]:rgpr = COPY $r2
+  ; CHECK:   [[COPY1:%[0-9]+]]:gpr = COPY $r1
+  ; CHECK:   [[COPY2:%[0-9]+]]:rgpr = COPY $r0
+  ; CHECK:   t2CMPri [[COPY]], 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2Bcc %bb.5, 0 /* CC::eq */, $cpsr
+  ; CHECK:   t2B %bb.1, 14 /* CC::al */, $noreg
+  ; CHECK: bb.1.prehead:
+  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   [[DEF:%[0-9]+]]:mqpr = IMPLICIT_DEF
+  ; CHECK:   [[MVE_VMOVimmi32_:%[0-9]+]]:mqpr = MVE_VMOVimmi32 0, 0, $noreg, [[DEF]]
+  ; CHECK:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2WhileLoopStartLR:%[0-9]+]]:gprlr = t2WhileLoopStartLR killed [[t2LSRri]], %bb.3, implicit-def $cpsr
+  ; CHECK: bb.2:
+  ; CHECK:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK:   [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %11, %bb.2
+  ; CHECK:   [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopStartLR]], %bb.1, %13, %bb.2
+  ; CHECK:   [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %15, %bb.2
+  ; CHECK:   [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
+  ; CHECK:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VMOVimmi32_]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
+  ; CHECK:   [[t2LoopEndDec:%[0-9]+]]:gprlr = t2LoopEndDec [[PHI1]], %bb.2, implicit-def $cpsr
+  ; CHECK:   t2B %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK: bb.3.prehead:
+  ; CHECK:   successors: %bb.4(0x80000000)
+  ; CHECK:   [[t2DoLoopStart:%[0-9]+]]:gprlr = t2DoLoopStart [[COPY]]
+  ; CHECK:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK: bb.4.for.body:
+  ; CHECK:   successors: %bb.4(0x7c000000), %bb.5(0x04000000)
+  ; CHECK:   [[PHI3:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.3, %19, %bb.4
+  ; CHECK:   [[PHI4:%[0-9]+]]:gpr = PHI [[COPY1]], %bb.3, %21, %bb.4
+  ; CHECK:   [[PHI5:%[0-9]+]]:gprlr = PHI [[t2DoLoopStart]], %bb.3, %26, %bb.4
+  ; CHECK:   [[t2LDRB_POST:%[0-9]+]]:rgpr, [[t2LDRB_POST1:%[0-9]+]]:gpr = t2LDRB_POST [[PHI3]], 1, 14 /* CC::al */, $noreg :: (load 1 from %ir.x.addr.08)
+  ; CHECK:   early-clobber %25:gprnopc = t2STRB_POST killed [[t2LDRB_POST]], [[PHI4]], 1, 14 /* CC::al */, $noreg :: (store 1 into %ir.y.addr.07)
+  ; CHECK:   [[COPY3:%[0-9]+]]:gpr = COPY %25
+  ; CHECK:   [[t2LoopEndDec1:%[0-9]+]]:gprlr = t2LoopEndDec [[PHI5]], %bb.4, implicit-def $cpsr
+  ; CHECK:   t2B %bb.5, 14 /* CC::al */, $noreg
+  ; CHECK: bb.5.for.cond.cleanup:
+  ; CHECK:   tBX_RET 14 /* CC::al */, $noreg
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    liveins: $r0, $r1, $r2
+
+    %9:rgpr = COPY $r2
+    %8:gpr = COPY $r1
+    %7:rgpr = COPY $r0
+    %10:gprlr = t2WhileLoopSetup %9
+    t2WhileLoopStart %10, %bb.3, implicit-def dead $cpsr
+    t2B %bb.1, 14 /* CC::al */, $noreg
+
+  bb.1.prehead:
+    successors: %bb.5(0x40000000), %bb.4(0x40000000)
+
+    %12:mqpr = IMPLICIT_DEF
+    %11:mqpr = MVE_VMOVimmi32 0, 0, $noreg, %12
+    %17:rgpr = t2ADDri %9, 15, 14 /* CC::al */, $noreg, $noreg
+    %18:rgpr = t2BICri killed %17, 16, 14 /* CC::al */, $noreg, $noreg
+    %19:gprlr = t2LSRri killed %18, 4, 14 /* CC::al */, $noreg, $noreg
+    %20:gprlr = t2WhileLoopSetup killed %19
+    t2WhileLoopStart %20, %bb.5, implicit-def $cpsr
+
+  bb.4:
+    successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+    %21:rgpr = PHI %7, %bb.1, %22, %bb.4
+    %23:gprlr = PHI %20, %bb.1, %24, %bb.4
+    %25:rgpr = PHI %9, %bb.1, %26, %bb.4
+    %27:vccr = MVE_VCTP8 %25, 0, $noreg
+    %26:rgpr = t2SUBri %25, 16, 14 /* CC::al */, $noreg, $noreg
+    %22:rgpr = MVE_VSTRBU8_post %11, %21, 16, 1, %27
+    %24:gprlr = t2LoopDec %23, 1
+    t2LoopEnd %24, %bb.4, implicit-def $cpsr
+    t2B %bb.5, 14 /* CC::al */, $noreg
+
+  bb.5.prehead:
+    successors: %bb.2(0x80000000)
+
+    %0:gpr = COPY %10
+    t2B %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.for.body:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+    %1:gpr = PHI %7, %bb.5, %4, %bb.2
+    %2:gpr = PHI %8, %bb.5, %5, %bb.2
+    %3:gprlr = PHI %0, %bb.5, %6, %bb.2
+    %13:rgpr, %4:gpr = t2LDRB_POST %1, 1, 14 /* CC::al */, $noreg :: (load 1 from %ir.x.addr.08)
+    early-clobber %14:gprnopc = t2STRB_POST killed %13, %2, 1, 14 /* CC::al */, $noreg :: (store 1 into %ir.y.addr.07)
+    %15:gprlr = t2LoopDec %3, 1
+    %5:gpr = COPY %14
+    %6:gpr = COPY %15
+    t2LoopEnd %15, %bb.2, implicit-def dead $cpsr
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.3.for.cond.cleanup:
+    tBX_RET 14 /* CC::al */, $noreg
+
+...




More information about the llvm-commits mailing list