[llvm] 6c2df5d - [ARM][LowOverheadLoops] Dont ignore VCTP

Mon Jan 27 02:59:43 PST 2020

Author: Sam Parker
Date: 2020-01-27T10:59:12Z
New Revision: 6c2df5d14f7adba1ec7decbece29162aa3a30861

URL: https://github.com/llvm/llvm-project/commit/6c2df5d14f7adba1ec7decbece29162aa3a30861
DIFF: https://github.com/llvm/llvm-project/commit/6c2df5d14f7adba1ec7decbece29162aa3a30861.diff

LOG: [ARM][LowOverheadLoops] Dont ignore VCTP

When expanding the LoopStart, we try to remove the iteration count
calculation. However, if part of that calculation is also used to
compute the number of elements, we could end up deleting
instructions that are required to feed DLSTP/WLSTP.

Differential Revision: https://reviews.llvm.org/D73275

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 132a8aca0cc7..4be631db6cf7 100644

--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -900,8 +900,7 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
       SmallPtrSet<MachineInstr*, 4> Visited;
       SmallPtrSet<MachineInstr*, 4> Remove;
       SmallPtrSet<MachineInstr*, 4> Ignore = { LoLoop.Start, LoLoop.Dec,
-                                               LoLoop.End, LoLoop.VCTP,
-                                               LoLoop.InsertPt };
+                                               LoLoop.End, LoLoop.InsertPt };
       SmallVector<MachineInstr*, 4> Chain = { Def };
       while (!Chain.empty()) {
         MachineInstr *MI = Chain.back();

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
new file mode 100644
index 000000000000..5998023a9b48
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+
+--- |
+  define hidden arm_aapcs_vfpcc void @dont_ignore_vctp(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 {
+  entry:
+    %mul = shl i32 %blockSize, 1
+    %0 = add i32 %mul, 3
+    %1 = icmp slt i32 %mul, 4
+    %smin = select i1 %1, i32 %mul, i32 4
+    %2 = sub i32 %0, %smin
+    %3 = lshr i32 %2, 2
+    %4 = add nuw nsw i32 %3, 1
+    call void @llvm.set.loop.iterations.i32(i32 %4)
+    br label %do.body
+
+  do.body:                                          ; preds = %do.body, %entry
+    %blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
+    %pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
+    %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
+    %5 = phi i32 [ %4, %entry ], [ %9, %do.body ]
+    %6 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
+    %input_cast = bitcast float* %pSrc.addr.0 to <4 x float>*
+    %7 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %input_cast, i32 4, <4 x i1> %6, <4 x float> undef)
+    %8 = fmul <4 x float> %7, <float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00>
+    %output_cast = bitcast float* %pDst.addr.0 to <4 x float>*
+    tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %8, <4 x float>* %output_cast, i32 4, <4 x i1> %6)
+    %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
+    %add.ptr4 = getelementptr inbounds float, float* %pDst.addr.0, i32 4
+    %sub = add nsw i32 %blkCnt.0, -4
+    %9 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %5, i32 1)
+    %10 = icmp ne i32 %9, 0
+    br i1 %10, label %do.body, label %do.end
+
+  do.end:                                           ; preds = %do.body
+    ret void
+  }
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+  declare void @llvm.set.loop.iterations.i32(i32)
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+
+...
+---
+name:            dont_ignore_vctp
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:
+  - id:              0
+    value:           '<4 x float> <float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00>'
+    alignment:       16
+    isTargetSpecific: false
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: dont_ignore_vctp
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   renamable $r3, dead $cpsr = tLSLri killed renamable $r2, 1, 14, $noreg
+  ; CHECK:   t2IT 11, 8, implicit-def dead $itstate
+  ; CHECK:   renamable $r2 = tLEApcrel %const.0, 14, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r3
+  ; CHECK: bb.1.do.body (align 4):
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1
+  ; CHECK:   renamable $q1 = nnan ninf nsz MVE_VLDRWU32 renamable $r0, 0, 0, $noreg
+  ; CHECK:   renamable $q1 = nnan ninf nsz MVE_VMULf32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1
+  ; CHECK:   MVE_VSTRWU32 killed renamable $q1, renamable $r1, 0, 0, killed $noreg
+  ; CHECK:   renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 16, 14, $noreg
+  ; CHECK:   renamable $r1, dead $cpsr = nuw tADDi8 killed renamable $r1, 16, 14, $noreg
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.do.end:
+  ; CHECK:   tPOP_RET 14, $noreg, def $r7, def $pc
+  ; CHECK: bb.3 (align 16):
+  ; CHECK:   CONSTPOOL_ENTRY 0, %const.0, 16
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    renamable $r3, dead $cpsr = tLSLri renamable $r2, 1, 14, $noreg
+    renamable $r12 = t2MOVi 4, 14, $noreg, $noreg
+    tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr
+    t2IT 11, 8, implicit-def $itstate
+    $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate
+    renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg
+    renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg
+    renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg
+    renamable $r2 = tLEApcrel %const.0, 14, $noreg
+    renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool)
+    t2DoLoopStart renamable $lr
+
+  bb.1.do.body (align 4):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+    MVE_VPST 2, implicit $vpr
+    renamable $q1 = nnan ninf nsz MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr
+    renamable $q1 = nnan ninf nsz MVE_VMULf32 killed renamable $q1, renamable $q0, 1, renamable $vpr, undef renamable $q1
+    MVE_VSTRWU32 killed renamable $q1, renamable $r1, 0, 1, killed renamable $vpr
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 16, 14, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    renamable $r1, dead $cpsr = nuw tADDi8 killed renamable $r1, 16, 14, $noreg
+    renamable $r3, dead $cpsr = nsw tSUBi8 killed renamable $r3, 4, 14, $noreg
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14, $noreg
+
+  bb.2.do.end:
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+  bb.3 (align 16):
+    CONSTPOOL_ENTRY 0, %const.0, 16
+
+...