[llvm] 40a3f7e - [ARM][LowOverheadLoops] Merge a VCMP and the new VPST into a VPT

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 9 07:04:45 PST 2020


Author: Sam Tebbs
Date: 2020-11-09T15:03:48Z
New Revision: 40a3f7e48d6bac6702357636a5f5c351415ed050

URL: https://github.com/llvm/llvm-project/commit/40a3f7e48d6bac6702357636a5f5c351415ed050
DIFF: https://github.com/llvm/llvm-project/commit/40a3f7e48d6bac6702357636a5f5c351415ed050.diff

LOG: [ARM][LowOverheadLoops] Merge a VCMP and the new VPST into a VPT

There were cases where a VCMP and a VPST were merged even though the
operands of the VCMP did not have the same reaching definitions at the
VPST as at the VCMP. This is fixed by adding RDA (ReachingDefAnalysis)
checks on those defs. That fix, however, gave rise to cases where the
newly created VPST would precede the un-merged VCMP and so trip a
predicate mask assertion, since the VCMP was not predicated. This is
solved by converting the VCMP into a VPT instead of inserting a new
VPST.
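
As an illustration, here is a hedged sketch of the instruction shapes
described above; the opcodes and registers are hypothetical and only
mirror the explanation, they are not taken verbatim from the patch or
its tests:

    @ Problematic shape: a fresh VPST inserted at the divergence point
    @ would sit directly in front of the un-merged, unpredicated VCMP
    @ and trip the block mask assertion.
    vpst
    vcmp.i32  ne, q2, zr     @ defines vpr but is not itself predicated
    vmovt     q2, q1         @ predicated consumer of vpr

    @ Shape produced instead: the VCMP becomes a VPT, which performs
    @ the compare and opens the predicated block in one instruction.
    vpt.i32   ne, q2, zr
    vmovt     q2, q1

The block mask of the new VPT is not filled in at this point; the patch
adds the instruction to BlockMasksToRecompute so the mask is
recalculated later.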

Differential Revision: https://reviews.llvm.org/D90461

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index d0e362f8adaa..0ec47bade34b 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1530,22 +1530,25 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         // TODO: We could be producing more VPT blocks than necessary and could
         // fold the newly created one into a preceding one.
         MachineInstr *Divergent = VPTState::getDivergent(Block);
-        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
-             E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
+        MachineInstr *VPST = Insts.front();
+        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
+        bool DivergentNextIsPredicated =
+            getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
+
+        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
+             I != E; ++I)
           RemovePredicate(&*I);
 
         // Check if the instruction defining vpr is a vcmp so it can be combined
         // with the VPST. This should be the divergent instruction.
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
-          ? Divergent
-          : nullptr;
-
-        MachineInstrBuilder MIB;
-        if (VCMP) {
-          // Combine the VPST and VCMP into a VPT
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
-                        Divergent->getDebugLoc(),
-                        TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
+        MachineInstr *VCMP =
+            VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
+
+        auto ReplaceVCMPWithVPT = [&]() {
+          // Replace the VCMP with a VPT
+          MachineInstrBuilder MIB = BuildMI(
+              *Divergent->getParent(), Divergent, Divergent->getDebugLoc(),
+              TII->get(VCMPOpcodeToVPT(VCMP->getOpcode())));
           MIB.addImm(ARMVCC::Then);
           // Register one
           MIB.add(VCMP->getOperand(1));
@@ -1555,18 +1558,31 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
           MIB.add(VCMP->getOperand(3));
           LLVM_DEBUG(dbgs()
                      << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+          LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
           LoLoop.ToRemove.insert(VCMP);
-        } else {
-          // Create a VPST (with a null mask for now, we'll recompute it later)
-          // or a VPT in case there was a VCMP right before it
-          MIB = BuildMI(*Divergent->getParent(), Divergent,
+        };
+
+        if (DivergentNextIsPredicated) {
+          // Insert a VPST at the divergent only if the next instruction
+          // would actually use it. A VCMP following a VPST can be
+          // merged into a VPT so do that instead if the VCMP exists.
+          if (!VCMP) {
+            // Create a VPST (with a null mask for now, we'll recompute it
+            // later)
+            MachineInstrBuilder MIB =
+                BuildMI(*Divergent->getParent(), Divergent,
                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
-          MIB.addImm(0);
-          LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            MIB.addImm(0);
+            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+            LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+          } else {
+            // No RDA checks are necessary here since the VPST would have been
+            // directly before the VCMP
+            ReplaceVCMPWithVPT();
+          }
         }
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
-        LoLoop.ToRemove.insert(Insts.front());
-        LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+        LoLoop.ToRemove.insert(VPST);
       }
     } else if (Block.containsVCTP()) {
       // The vctp will be removed, so the block mask of the vp(s)t will need

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index 222c2f036ca8..c8001df58e8c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=force-enabled-no-reductions -o - %s | FileCheck %s
 
 define arm_aapcs_vfpcc <16 x i8> @vcmp_vpst_combination(<16 x i8>* %pSrc, i16 zeroext %blockSize, i8* nocapture %pResult, i32* nocapture %pIndex) {
@@ -40,6 +41,70 @@ do.end:                                           ; preds = %do.body
   ret <16 x i8> %6
 }
 
+define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
+; CHECK-LABEL: vcmp_new_vpst_combination:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    blt .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    dlstp.32 lr, r0
+; CHECK-NEXT:  .LBB1_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vcmp.i32 ne, q2, zr
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmovt q2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q2
+; CHECK-NEXT:    letp lr, .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %len, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %len)
+  %0 = getelementptr inbounds i32, i32* %arr, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp ne <4 x i32> %wide.masked.load, zeroinitializer
+  %narrow = and <4 x i1> %active.lane.mask, %2
+  %3 = zext <4 x i1> %narrow to <4 x i32>
+  %4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %3)
+  %5 = add i32 %4, %vec.phi
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
+  ret i32 %count.0.lcssa
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+
 declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
 
 declare <16 x i1> @llvm.arm.mve.vctp8(i32)

