[llvm] 31f02ac - [ARM] Use mov operand if the mov cannot be moved while tail predicating

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 18 09:10:38 PDT 2020


Author: Sam Tebbs
Date: 2020-08-18T17:10:29+01:00
New Revision: 31f02ac60aa8e89c04617e82fa2b1140e33e824d

URL: https://github.com/llvm/llvm-project/commit/31f02ac60aa8e89c04617e82fa2b1140e33e824d
DIFF: https://github.com/llvm/llvm-project/commit/31f02ac60aa8e89c04617e82fa2b1140e33e824d.diff

LOG: [ARM] Use mov operand if the mov cannot be moved while tail predicating

There are some cases where the instruction that sets up the iteration
count for a tail predicated loop cannot be moved before the dlstp,
stopping tail predication entirely. This patch checks if the mov operand
can be used and if so, uses that instead.

Differential Revision: https://reviews.llvm.org/D86087

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll

Modified: 
    llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index be75d6bef08c..2e7cd412db1c 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -226,6 +226,7 @@ namespace {
     MachineInstr *Dec = nullptr;
     MachineInstr *End = nullptr;
     MachineInstr *VCTP = nullptr;
+    MachineOperand TPNumElements;
     SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
     VPTBlock *CurrentBlock = nullptr;
     SetVector<MachineInstr*> CurrentPredicate;
@@ -239,7 +240,8 @@ namespace {
     LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
                     ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
                     const ARMBaseInstrInfo &TII)
-      : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
+        : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
+          TPNumElements(MachineOperand::CreateImm(0)) {
       MF = ML.getHeader()->getParent();
       if (auto *MBB = ML.getLoopPreheader())
         Preheader = MBB;
@@ -291,11 +293,10 @@ namespace {
 
     SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
 
-    // Return the loop iteration count, or the number of elements if we're tail
-    // predicating.
-    MachineOperand &getCount() {
-      return IsTailPredicationLegal() ?
-        VCTP->getOperand(1) : Start->getOperand(0);
+    // Return the operand for the loop start instruction. This will be the loop
+    // iteration count, or the number of elements if we're tail predicating.
+    MachineOperand &getLoopStartOperand() {
+      return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0);
     }
 
     unsigned getStartOpcode() const {
@@ -453,7 +454,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   // of the iteration count, to the loop start instruction. The number of
   // elements is provided to the vctp instruction, so we need to check that
   // we can use this register at InsertPt.
-  Register NumElements = VCTP->getOperand(1).getReg();
+  TPNumElements = VCTP->getOperand(1);
+  Register NumElements = TPNumElements.getReg();
 
   // If the register is defined within loop, then we can't perform TP.
   // TODO: Check whether this is just a mov of a register that would be
@@ -466,9 +468,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
   // The element count register maybe defined after InsertPt, in which case we
   // need to try to move either InsertPt or the def so that the [w|d]lstp can
   // use the value.
-  // TODO: On failing to move an instruction, check if the count is provided by
-  // a mov and whether we can use the mov operand directly.
   MachineBasicBlock *InsertBB = StartInsertPt->getParent();
+
   if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) {
     if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) {
       if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) {
@@ -482,9 +483,21 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
                               StartInsertPt);
         LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
       } else {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
-                   << "start instruction.\n");
-        return false;
+        // If we fail to move an instruction and the element count is provided
+        // by a mov, use the mov operand if it will have the same value at the
+        // insertion point
+        MachineOperand Operand = ElemDef->getOperand(1);
+        if (isMovRegOpcode(ElemDef->getOpcode()) &&
+            RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) ==
+                RDA.getUniqueReachingMIDef(StartInsertPt, Operand.getReg())) {
+          TPNumElements = Operand;
+          NumElements = TPNumElements.getReg();
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "ARM Loops: Unable to move element count to loop "
+                     << "start instruction.\n");
+          return false;
+        }
       }
     }
   }

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
new file mode 100644
index 000000000000..9a5856335dfc
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir
@@ -0,0 +1,269 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s
+
+--- |
+  define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 {
+  entry:
+    %0 = add i32 %blockSize, 3
+    %1 = icmp slt i32 %blockSize, 4
+    %smin = select i1 %1, i32 %blockSize, i32 4
+    %2 = sub i32 %0, %smin
+    %3 = lshr i32 %2, 2
+    %4 = add nuw nsw i32 %3, 1
+    %5 = icmp slt i32 %blockSize, 4
+    %smin3 = select i1 %5, i32 %blockSize, i32 4
+    %6 = sub i32 %0, %smin3
+    %7 = lshr i32 %6, 2
+    %8 = add nuw nsw i32 %7, 1
+    call void @llvm.set.loop.iterations.i32(i32 %8)
+    br label %do.body.i
+
+  do.body.i:                                        ; preds = %do.body.i, %entry
+    %blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ]
+    %sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ]
+    %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
+    %9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ]
+    %pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>*
+    %10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
+    %11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer)
+    %12 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %11, <4 x i1> %10, <4 x float> %sumVec.0.i)
+    %add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4
+    %13 = add i32 %blkCnt.0.i, -4
+    %14 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1)
+    %15 = icmp ne i32 %14, 0
+    br i1 %15, label %do.body.i, label %arm_mean_f32_mve.exit
+
+  arm_mean_f32_mve.exit:                            ; preds = %do.body.i
+    %16 = extractelement <4 x float> %12, i32 3
+    %add2.i.i = fadd fast float %16, %16
+    %conv.i = uitofp i32 %blockSize to float
+    %div.i = fdiv fast float %add2.i.i, %conv.i
+    %17 = bitcast float %div.i to i32
+    %18 = insertelement <4 x i32> undef, i32 %17, i64 0
+    %19 = shufflevector <4 x i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer
+    %20 = bitcast <4 x i32> %19 to <4 x float>
+    call void @llvm.set.loop.iterations.i32(i32 %4)
+    br label %do.body
+
+  do.body:                                          ; preds = %do.body, %arm_mean_f32_mve.exit
+    %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ]
+    %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ]
+    %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
+    %21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ]
+    %pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>*
+    %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
+    %23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> zeroinitializer)
+    %24 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %23, <4 x float> %20, <4 x i1> %22, <4 x float> undef)
+    %25 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %24, <4 x float> %24, <4 x float> %sumVec.0, <4 x i1> %22)
+    %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
+    %26 = add i32 %blkCnt.0, -4
+    %27 = call i32 @llvm.loop.decrement.reg.i32(i32 %21, i32 1)
+    %28 = icmp ne i32 %27, 0
+    br i1 %28, label %do.body, label %do.end
+
+  do.end:                                           ; preds = %do.body
+    %29 = extractelement <4 x float> %25, i32 3
+    %add2.i = fadd fast float %29, %29
+    %sub2 = add i32 %blockSize, -1
+    %conv = uitofp i32 %sub2 to float
+    %div = fdiv fast float %add2.i, %conv
+    store float %div, float* %pResult, align 4
+    ret void
+  }
+
+  ; Function Attrs: nounwind readnone
+  declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+  ; Function Attrs: nounwind readnone
+  declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1
+
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+
+  ; Function Attrs: argmemonly nounwind readonly willreturn
+  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2
+
+  ; Function Attrs: nounwind readnone
+  declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+
+  ; Function Attrs: noduplicate nounwind
+  declare void @llvm.set.loop.iterations.i32(i32) #3
+
+  ; Function Attrs: noduplicate nounwind
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3
+
+  attributes #0 = { "target-features"="+mve.fp" }
+  attributes #1 = { nounwind readnone "target-features"="+mve.fp" }
+  attributes #2 = { argmemonly nounwind readonly willreturn "target-features"="+mve.fp" }
+  attributes #3 = { noduplicate nounwind }
+
+...
+---
+name:            arm_var_f32_mve
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: arm_var_f32_mve
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r4
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r4, -8
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+  ; CHECK:   $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r3
+  ; CHECK: bb.1.do.body.i:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r12
+  ; CHECK:   renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
+  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK: bb.2.arm_mean_f32_mve.exit:
+  ; CHECK:   successors: %bb.3(0x80000000)
+  ; CHECK:   liveins: $q0, $r0, $r1, $r2
+  ; CHECK:   $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = MVE_DLSTP_32 $r1
+  ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
+  ; CHECK:   renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
+  ; CHECK:   $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+  ; CHECK: bb.3.do.body:
+  ; CHECK:   successors: %bb.3(0x7c000000), %bb.4(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4)
+  ; CHECK:   renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
+  ; CHECK:   renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.3
+  ; CHECK: bb.4.do.end:
+  ; CHECK:   liveins: $q0, $r1, $r2
+  ; CHECK:   renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
+  ; CHECK:   $s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg
+  ; CHECK:   VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult)
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2, $r4, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+    tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2IT 10, 8, implicit-def $itstate
+    renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate
+    renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg
+    renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
+    $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+    $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
+    t2DoLoopStart renamable $lr
+    $r4 = tMOVr $lr, 14 /* CC::al */, $noreg
+
+  bb.1.do.body.i:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12
+
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    MVE_VPST 4, implicit $vpr
+    renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
+    renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0
+    t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2.arm_mean_f32_mve.exit:
+    successors: %bb.3(0x80000000)
+    liveins: $q0, $r0, $r1, $r2, $r4
+
+    $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
+    $lr = tMOVr $r4, 14 /* CC::al */, $noreg
+    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
+    t2DoLoopStart killed $r4
+    renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
+    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
+    renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+    renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
+    $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+
+  bb.3.do.body:
+    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
+    liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3
+
+    renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+    renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    MVE_VPST 2, implicit $vpr
+    renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.01, align 4)
+    renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2
+    renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, renamable $q2, 1, killed renamable $vpr
+    t2LoopEnd renamable $lr, %bb.3, implicit-def dead $cpsr
+    tB %bb.4, 14 /* CC::al */, $noreg
+
+  bb.4.do.end:
+    liveins: $q0, $r1, $r2
+
+    renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg
+    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0
+    $s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg
+    renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg
+    renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg
+    VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult)
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
+
+...

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
new file mode 100644
index 000000000000..63a2c0233f6e
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
+define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) {
+; CHECK-LABEL:  .LBB0_1: @ %do.body.i
+; CHECK:    dlstp.32 lr, r1
+; CHECK-NEXT:    vadd.f32 s0, s3, s3
+; CHECK-NEXT:    vcvt.f32.u32 s4, s4
+; CHECK-NEXT:    vdiv.f32 s0, s0, s4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vdup.32 q1, r3
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:  .LBB0_3: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subs r3, #4
+; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
+; CHECK-NEXT:    vsub.f32 q2, q2, q1
+; CHECK-NEXT:    vfma.f32 q0, q2, q2
+; CHECK-NEXT:    letp lr, .LBB0_3
+entry:
+  br label %do.body.i
+
+do.body.i:                                        ; preds = %entry, %do.body.i
+  %blkCnt.0.i = phi i32 [ %sub.i, %do.body.i ], [ %blockSize, %entry ]
+  %sumVec.0.i = phi <4 x float> [ %3, %do.body.i ], [ zeroinitializer, %entry ]
+  %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ]
+  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i)
+  %1 = bitcast float* %pSrc.addr.0.i to <4 x float>*
+  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
+  %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %2, <4 x i1> %0, <4 x float> %sumVec.0.i)
+  %sub.i = add nsw i32 %blkCnt.0.i, -4
+  %add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4
+  %cmp.i = icmp sgt i32 %blkCnt.0.i, 4
+  br i1 %cmp.i, label %do.body.i, label %arm_mean_f32_mve.exit
+
+arm_mean_f32_mve.exit:                            ; preds = %do.body.i
+  %4 = extractelement <4 x float> %3, i32 3
+  %add2.i.i = fadd fast float %4, %4
+  %conv.i = uitofp i32 %blockSize to float
+  %div.i = fdiv fast float %add2.i.i, %conv.i
+  %.splatinsert = insertelement <4 x float> undef, float %div.i, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %arm_mean_f32_mve.exit
+  %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %sub, %do.body ]
+  %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %9, %do.body ]
+  %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ]
+  %5 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
+  %6 = bitcast float* %pSrc.addr.0 to <4 x float>*
+  %7 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %5, <4 x float> zeroinitializer)
+  %8 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %7, <4 x float> %.splat, <4 x i1> %5, <4 x float> undef)
+  %9 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %8, <4 x float> %8, <4 x float> %sumVec.0, <4 x i1> %5)
+  %sub = add nsw i32 %blkCnt.0, -4
+  %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
+  %cmp1 = icmp sgt i32 %blkCnt.0, 4
+  br i1 %cmp1, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %10 = extractelement <4 x float> %9, i32 3
+  %add2.i = fadd fast float %10, %10
+  %sub2 = add i32 %blockSize, -1
+  %conv = uitofp i32 %sub2 to float
+  %div = fdiv fast float %add2.i, %conv
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %do.end
+  store float %div, float* %pResult, align 4
+  ret void
+}
+
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+
+declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
+
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+


        


More information about the llvm-commits mailing list