[llvm] 652faed - [CodeGen] Improve SelectionDAGBuilder lowering code for get.active.lane.mask intrinsic
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 10 05:39:44 PST 2021
Author: David Sherwood
Date: 2021-12-10T13:39:38Z
New Revision: 652faed3539eefad773bccd635fd01a037e0a4ce
URL: https://github.com/llvm/llvm-project/commit/652faed3539eefad773bccd635fd01a037e0a4ce
DIFF: https://github.com/llvm/llvm-project/commit/652faed3539eefad773bccd635fd01a037e0a4ce.diff
LOG: [CodeGen] Improve SelectionDAGBuilder lowering code for get.active.lane.mask intrinsic
Previously we used UADDO to generate a two-result value containing the
unsigned addition and its overflow mask, and then combined the inverted
overflow mask with the trip-count comparison to form the result.
However, we don't need to do this: we can simply use a UADDSAT
saturating-add node to add the vector index splat and the stepvector
together, and then compare the result against a splat of the trip
count. Any lane that would overflow saturates to the maximum unsigned
value, which is never strictly less than the trip count, so the
overflowing lanes naturally compare false and no separate overflow
mask is needed. This results in overall better code quality for both
Thumb2 and AArch64.
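For illustration, here is the shape of the old and new lowerings as a
minimal sketch assembled from the hunk below; the surrounding values
(VecTy, CCVT, VectorIndex, VectorTripCount, sdl) come from the
unchanged context in SelectionDAGBuilder::visitIntrinsicCall:

  SDValue VectorStep = DAG.getStepVector(sdl, VecTy);

  // Old lowering: overflowing add plus an explicit mask fixup,
  // i.e. AND(NOT(overflow), (index + step) u< TC).
  SDValue VectorInduction = DAG.getNode(
      ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep);
  SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0),
                               VectorTripCount, ISD::CondCode::SETULT);
  setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT,
                           DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT),
                           SetCC));

  // New lowering: a lane that would wrap saturates to the maximum
  // unsigned value, which is never u< TC, so the single compare
  // already yields the correct mask.
  SDValue VectorInduction = DAG.getNode(
      ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep);
  SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction,
                               VectorTripCount, ISD::CondCode::SETULT);
  setValue(&I, SetCC);

(The two variants are alternatives for the same scope, shown together
here only for comparison.)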
Differential Revision: https://reviews.llvm.org/D115354
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/test/CodeGen/AArch64/active_lane_mask.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
llvm/test/CodeGen/Thumb2/active_lane_mask.ll
llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4f6f2780139ca..e511663337f3d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7129,12 +7129,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
SDValue VectorInduction = DAG.getNode(
- ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep);
- SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0),
+ ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep);
+ SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction,
VectorTripCount, ISD::CondCode::SETULT);
- setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT,
- DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT),
- SetCC));
+ setValue(&I, SetCC);
return;
}
case Intrinsic::experimental_vector_insert: {
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index bc5eee7af0c6e..e04068d499877 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -76,14 +76,12 @@ define <vscale x 2 x i1> @lane_mask_nxv2i1_i64(i64 %index, i64 %TC) {
define <vscale x 16 x i1> @lane_mask_nxv16i1_i8(i8 %index, i8 %TC) {
; CHECK-LABEL: lane_mask_nxv16i1_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: index z0.b, w0, #1
+; CHECK-NEXT: index z0.b, #0, #1
; CHECK-NEXT: mov z1.b, w0
-; CHECK-NEXT: mov z2.b, w1
-; CHECK-NEXT: cmphi p1.b, p0/z, z1.b, z0.b
-; CHECK-NEXT: cmphi p2.b, p0/z, z2.b, z0.b
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT: uqadd z0.b, z1.b, z0.b
+; CHECK-NEXT: mov z1.b, w1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cmphi p0.b, p0/z, z1.b, z0.b
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 %index, i8 %TC)
ret <vscale x 16 x i1> %active.lane.mask
@@ -97,15 +95,12 @@ define <vscale x 8 x i1> @lane_mask_nxv8i1_i8(i8 %index, i8 %TC) {
; CHECK-NEXT: and z0.h, z0.h, #0xff
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.h, w1
+; CHECK-NEXT: mov z1.h, w1
+; CHECK-NEXT: umin z0.h, z0.h, #255
; CHECK-NEXT: and z1.h, z1.h, #0xff
-; CHECK-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT: cmphi p2.h, p0/z, z2.h, z1.h
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cmphi p0.h, p0/z, z1.h, z0.h
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i8(i8 %index, i8 %TC)
ret <vscale x 8 x i1> %active.lane.mask
@@ -119,15 +114,12 @@ define <vscale x 4 x i1> @lane_mask_nxv4i1_i8(i8 %index, i8 %TC) {
; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: and z1.s, z1.s, #0xff
; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.s, w1
+; CHECK-NEXT: mov z1.s, w1
+; CHECK-NEXT: umin z0.s, z0.s, #255
; CHECK-NEXT: and z1.s, z1.s, #0xff
-; CHECK-NEXT: and z2.s, z2.s, #0xff
-; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, z0.s
-; CHECK-NEXT: cmphi p2.s, p0/z, z2.s, z1.s
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cmphi p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i8(i8 %index, i8 %TC)
ret <vscale x 4 x i1> %active.lane.mask
@@ -144,14 +136,11 @@ define <vscale x 2 x i1> @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) {
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: mov z2.d, x1
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z1.d, z1.d, #0xff
+; CHECK-NEXT: umin z0.d, z0.d, #255
; CHECK-NEXT: and z2.d, z2.d, #0xff
-; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, z0.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: not p1.b, p0/z, p1.b
-; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i8(i8 %index, i8 %TC)
ret <vscale x 2 x i1> %active.lane.mask
@@ -165,8 +154,6 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) {
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
@@ -174,60 +161,39 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) {
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: mov z3.s, w0
; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z4.s, w1
; CHECK-NEXT: incw z1.s
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z6.d, z1.d
-; CHECK-NEXT: add z4.s, z3.s, z0.s
+; CHECK-NEXT: uqadd z5.s, z3.s, z0.s
; CHECK-NEXT: incw z2.s, all, mul #2
-; CHECK-NEXT: add z5.s, z3.s, z1.s
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: cmphi p1.s, p0/z, z4.s, z5.s
+; CHECK-NEXT: uqadd z5.s, z3.s, z1.s
+; CHECK-NEXT: cmphi p2.s, p0/z, z4.s, z5.s
+; CHECK-NEXT: uqadd z5.s, z3.s, z2.s
; CHECK-NEXT: incw z6.s, all, mul #2
-; CHECK-NEXT: cmphi p1.s, p0/z, z3.s, z4.s
-; CHECK-NEXT: cmphi p2.s, p0/z, z3.s, z5.s
-; CHECK-NEXT: add z7.s, z3.s, z2.s
-; CHECK-NEXT: add z25.s, z3.s, z6.s
-; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT: cmphi p2.s, p0/z, z3.s, z7.s
-; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z25.s
-; CHECK-NEXT: mov z24.s, w1
-; CHECK-NEXT: uzp1 p2.h, p2.h, p4.h
; CHECK-NEXT: incw z0.s, all, mul #4
+; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z5.s
+; CHECK-NEXT: uqadd z5.s, z3.s, z6.s
; CHECK-NEXT: incw z1.s, all, mul #4
+; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z5.s
+; CHECK-NEXT: uqadd z0.s, z3.s, z0.s
+; CHECK-NEXT: uqadd z1.s, z3.s, z1.s
; CHECK-NEXT: incw z2.s, all, mul #4
; CHECK-NEXT: incw z6.s, all, mul #4
-; CHECK-NEXT: cmphi p3.s, p0/z, z24.s, z5.s
-; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z4.s
-; CHECK-NEXT: cmphi p4.s, p0/z, z24.s, z7.s
-; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT: cmphi p2.s, p0/z, z24.s, z25.s
-; CHECK-NEXT: add z0.s, z3.s, z0.s
-; CHECK-NEXT: add z1.s, z3.s, z1.s
-; CHECK-NEXT: add z2.s, z3.s, z2.s
-; CHECK-NEXT: add z4.s, z3.s, z6.s
-; CHECK-NEXT: uzp1 p3.h, p5.h, p3.h
-; CHECK-NEXT: uzp1 p2.h, p4.h, p2.h
-; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z0.s
-; CHECK-NEXT: cmphi p5.s, p0/z, z3.s, z1.s
-; CHECK-NEXT: cmphi p6.s, p0/z, z3.s, z2.s
-; CHECK-NEXT: cmphi p7.s, p0/z, z3.s, z4.s
-; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-NEXT: uzp1 p5.h, p6.h, p7.h
-; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b
-; CHECK-NEXT: uzp1 p3.b, p4.b, p5.b
-; CHECK-NEXT: cmphi p4.s, p0/z, z24.s, z0.s
-; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z1.s
-; CHECK-NEXT: ptrue p6.b
-; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z2.s
-; CHECK-NEXT: cmphi p0.s, p0/z, z24.s, z4.s
-; CHECK-NEXT: not p1.b, p6/z, p1.b
-; CHECK-NEXT: uzp1 p0.h, p5.h, p0.h
-; CHECK-NEXT: not p3.b, p6/z, p3.b
-; CHECK-NEXT: uzp1 p4.b, p4.b, p0.b
-; CHECK-NEXT: and p0.b, p6/z, p1.b, p2.b
-; CHECK-NEXT: and p1.b, p6/z, p3.b, p4.b
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h
+; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h
+; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z0.s
+; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z1.s
+; CHECK-NEXT: uqadd z0.s, z3.s, z2.s
+; CHECK-NEXT: uqadd z1.s, z3.s, z6.s
+; CHECK-NEXT: cmphi p5.s, p0/z, z4.s, z0.s
+; CHECK-NEXT: cmphi p0.s, p0/z, z4.s, z1.s
+; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h
+; CHECK-NEXT: uzp1 p4.h, p5.h, p0.h
+; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b
+; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
@@ -241,135 +207,90 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_nxv32i1_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z2.d, #0, #1
-; CHECK-NEXT: mov z0.d, x0
-; CHECK-NEXT: mov z3.d, z2.d
-; CHECK-NEXT: mov z4.d, z2.d
-; CHECK-NEXT: incd z3.d
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: mov z7.d, z3.d
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov z3.d, x0
+; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: incd z7.d, all, mul #2
-; CHECK-NEXT: add z5.d, z0.d, z2.d
-; CHECK-NEXT: add z6.d, z0.d, z3.d
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z4.d, x1
+; CHECK-NEXT: incd z1.d
+; CHECK-NEXT: uqadd z5.d, z3.d, z0.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z1.d
+; CHECK-NEXT: cmphi p1.d, p0/z, z4.d, z5.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: incd z2.d, all, mul #2
+; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z2.d
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: incd z5.d, all, mul #2
+; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s
+; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z5.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: incd z7.d, all, mul #4
+; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z7.d
; CHECK-NEXT: mov z25.d, z2.d
-; CHECK-NEXT: mov z26.d, z3.d
-; CHECK-NEXT: mov z28.d, z4.d
-; CHECK-NEXT: mov z29.d, z7.d
-; CHECK-NEXT: cmphi p1.d, p0/z, z0.d, z5.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z0.d, z6.d
-; CHECK-NEXT: add z24.d, z0.d, z4.d
-; CHECK-NEXT: add z27.d, z0.d, z7.d
+; CHECK-NEXT: incd z24.d, all, mul #4
+; CHECK-NEXT: mov z26.d, z5.d
+; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z24.d
; CHECK-NEXT: incd z25.d, all, mul #4
+; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z25.d
; CHECK-NEXT: incd z26.d, all, mul #4
-; CHECK-NEXT: incd z28.d, all, mul #4
-; CHECK-NEXT: incd z29.d, all, mul #4
-; CHECK-NEXT: cmphi p3.d, p0/z, z0.d, z24.d
-; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT: cmphi p2.d, p0/z, z0.d, z27.d
-; CHECK-NEXT: add z30.d, z0.d, z25.d
-; CHECK-NEXT: add z31.d, z0.d, z26.d
-; CHECK-NEXT: add z8.d, z0.d, z28.d
-; CHECK-NEXT: add z9.d, z0.d, z29.d
-; CHECK-NEXT: uzp1 p2.s, p3.s, p2.s
-; CHECK-NEXT: cmphi p3.d, p0/z, z0.d, z30.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z0.d, z31.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z0.d, z8.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z9.d
-; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s
-; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s
-; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h
-; CHECK-NEXT: mov z1.d, x1
-; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b
-; CHECK-NEXT: cmphi p2.d, p0/z, z1.d, z6.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z1.d, z5.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z24.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z27.d
-; CHECK-NEXT: uzp1 p2.s, p3.s, p2.s
-; CHECK-NEXT: uzp1 p3.s, p4.s, p5.s
-; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z30.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z31.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z8.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z9.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: uqadd z6.d, z3.d, z26.d
+; CHECK-NEXT: uzp1 p2.s, p2.s, p3.s
+; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d
+; CHECK-NEXT: incd z0.d, all, mul #8
+; CHECK-NEXT: incd z1.d, all, mul #8
+; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s
+; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s
+; CHECK-NEXT: uqadd z0.d, z3.d, z0.d
+; CHECK-NEXT: uqadd z1.d, z3.d, z1.d
; CHECK-NEXT: incd z2.d, all, mul #8
-; CHECK-NEXT: incd z3.d, all, mul #8
-; CHECK-NEXT: incd z4.d, all, mul #8
+; CHECK-NEXT: incd z5.d, all, mul #8
+; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h
+; CHECK-NEXT: uzp1 p2.h, p4.h, p3.h
+; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z0.d
+; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z1.d
+; CHECK-NEXT: uqadd z0.d, z3.d, z2.d
+; CHECK-NEXT: uqadd z1.d, z3.d, z5.d
; CHECK-NEXT: incd z7.d, all, mul #8
-; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s
-; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s
-; CHECK-NEXT: add z2.d, z0.d, z2.d
-; CHECK-NEXT: add z3.d, z0.d, z3.d
-; CHECK-NEXT: add z4.d, z0.d, z4.d
-; CHECK-NEXT: add z5.d, z0.d, z7.d
+; CHECK-NEXT: incd z24.d, all, mul #8
+; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z0.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d
+; CHECK-NEXT: uqadd z0.d, z3.d, z7.d
+; CHECK-NEXT: uqadd z1.d, z3.d, z24.d
; CHECK-NEXT: incd z25.d, all, mul #8
; CHECK-NEXT: incd z26.d, all, mul #8
-; CHECK-NEXT: incd z28.d, all, mul #8
-; CHECK-NEXT: incd z29.d, all, mul #8
-; CHECK-NEXT: uzp1 p2.h, p2.h, p3.h
-; CHECK-NEXT: uzp1 p3.h, p4.h, p5.h
-; CHECK-NEXT: cmphi p4.d, p0/z, z0.d, z2.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z0.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z4.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z0.d, z5.d
-; CHECK-NEXT: add z6.d, z0.d, z25.d
-; CHECK-NEXT: add z7.d, z0.d, z26.d
-; CHECK-NEXT: add z24.d, z0.d, z28.d
-; CHECK-NEXT: add z25.d, z0.d, z29.d
-; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s
-; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s
-; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z6.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z0.d, z7.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z0.d, z24.d
-; CHECK-NEXT: cmphi p9.d, p0/z, z0.d, z25.d
-; CHECK-NEXT: uzp1 p6.s, p6.s, p7.s
-; CHECK-NEXT: uzp1 p7.s, p8.s, p9.s
-; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-NEXT: uzp1 p5.h, p6.h, p7.h
-; CHECK-NEXT: uzp1 p2.b, p2.b, p3.b
-; CHECK-NEXT: uzp1 p3.b, p4.b, p5.b
-; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z2.d
-; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z4.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z5.d
-; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s
-; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s
-; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z6.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z7.d
-; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s
-; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z24.d
-; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z25.d
-; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.s, p6.s, p0.s
-; CHECK-NEXT: ptrue p6.b
-; CHECK-NEXT: uzp1 p0.h, p5.h, p0.h
-; CHECK-NEXT: not p1.b, p6/z, p1.b
-; CHECK-NEXT: not p3.b, p6/z, p3.b
-; CHECK-NEXT: uzp1 p4.b, p4.b, p0.b
-; CHECK-NEXT: and p0.b, p6/z, p1.b, p2.b
-; CHECK-NEXT: and p1.b, p6/z, p3.b, p4.b
-; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s
+; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s
+; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z0.d
+; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d
+; CHECK-NEXT: uqadd z0.d, z3.d, z25.d
+; CHECK-NEXT: uqadd z1.d, z3.d, z26.d
+; CHECK-NEXT: cmphi p7.d, p0/z, z4.d, z0.d
+; CHECK-NEXT: cmphi p0.d, p0/z, z4.d, z1.d
+; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s
+; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s
+; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h
+; CHECK-NEXT: uzp1 p4.h, p5.h, p0.h
+; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b
+; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b
; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %TC)
@@ -379,31 +300,17 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) {
define <vscale x 32 x i1> @lane_mask_nxv32i1_i8(i8 %index, i8 %TC) {
; CHECK-LABEL: lane_mask_nxv32i1_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: index z1.b, #0, #1
-; CHECK-NEXT: mov z0.b, w8
+; CHECK-NEXT: index z0.b, #0, #1
+; CHECK-NEXT: mov z1.b, w8
; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: add z4.b, z2.b, z1.b
-; CHECK-NEXT: add z0.b, z2.b, z0.b
+; CHECK-NEXT: add z1.b, z0.b, z1.b
; CHECK-NEXT: mov z3.b, w1
-; CHECK-NEXT: cmphi p2.b, p1/z, z2.b, z4.b
-; CHECK-NEXT: cmphi p3.b, p1/z, z2.b, z0.b
-; CHECK-NEXT: cmphi p0.b, p1/z, z3.b, z4.b
-; CHECK-NEXT: not p2.b, p1/z, p2.b
-; CHECK-NEXT: cmphi p4.b, p1/z, z3.b, z0.b
-; CHECK-NEXT: not p3.b, p1/z, p3.b
-; CHECK-NEXT: and p0.b, p1/z, p2.b, p0.b
-; CHECK-NEXT: and p1.b, p1/z, p3.b, p4.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: uqadd z0.b, z2.b, z0.b
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: uqadd z1.b, z2.b, z1.b
+; CHECK-NEXT: cmphi p0.b, p1/z, z3.b, z0.b
+; CHECK-NEXT: cmphi p1.b, p1/z, z3.b, z1.b
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i8(i8 %index, i8 %TC)
ret <vscale x 32 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
index 821ba084adb03..580485874f5fc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
@@ -15,13 +15,9 @@ define dso_local i32 @test_500_504(i32* nocapture readonly %x) {
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q0, r1
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
+; CHECK-NEXT: vqadd.u32 q2, q0, r1
; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 hi, q1, q2
+; CHECK-NEXT: vptt.u32 hi, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vaddvat.u32 r2, q2
; CHECK-NEXT: le lr, .LBB0_1
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 74df7be81241d..f63477e038264 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -4,53 +4,59 @@
define <2 x i64> @v2i64(i32 %index, i32 %TC, <2 x i64> %V1, <2 x i64> %V2) {
; CHECK-LABEL: v2i64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r0
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vmov q1[2], q1[0], r0, r0
+; CHECK-NEXT: vmov.i64 q0, #0xffffffff
+; CHECK-NEXT: vand q1, q1, q0
+; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: vmov r0, r4, d3
; CHECK-NEXT: vmov q2[2], q2[0], r1, r1
-; CHECK-NEXT: vmov r0, r12, d1
-; CHECK-NEXT: vmov lr, s0
-; CHECK-NEXT: adds r0, #1
-; CHECK-NEXT: vmov q0[2], q0[0], lr, r0
-; CHECK-NEXT: adc r12, r12, #0
-; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vand q1, q2, q1
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vldr d1, [sp, #16]
-; CHECK-NEXT: eors r0, r4
-; CHECK-NEXT: orrs.w r0, r0, r12
-; CHECK-NEXT: vmov r1, r0, d3
-; CHECK-NEXT: cset r12, eq
-; CHECK-NEXT: subs r1, r4, r1
-; CHECK-NEXT: sbcs.w r0, r5, r0
-; CHECK-NEXT: vmov r1, r5, d0
-; CHECK-NEXT: cset r0, lo
-; CHECK-NEXT: vmov d0, r2, r3
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: cset r0, ne
-; CHECK-NEXT: and.w r0, r0, r12
-; CHECK-NEXT: rsb.w r12, r0, #0
-; CHECK-NEXT: vmov r4, r0, d2
-; CHECK-NEXT: subs r4, r1, r4
-; CHECK-NEXT: sbcs.w r0, r5, r0
+; CHECK-NEXT: vmov lr, r12, d2
+; CHECK-NEXT: adds r6, r0, #1
+; CHECK-NEXT: adc r4, r4, #0
+; CHECK-NEXT: subs.w r0, lr, #-1
+; CHECK-NEXT: sbcs r0, r12, #0
+; CHECK-NEXT: vmov q1[2], q1[0], lr, r6
; CHECK-NEXT: cset r0, lo
+; CHECK-NEXT: vmov q1[3], q1[1], r12, r4
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: cset r0, ne
-; CHECK-NEXT: teq.w r1, lr
-; CHECK-NEXT: cset r1, eq
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: bfi r1, r0, #0, #8
+; CHECK-NEXT: mov.w r0, #0
+; CHECK-NEXT: csetm r12, ne
+; CHECK-NEXT: subs.w r6, r6, #-1
+; CHECK-NEXT: sbcs r6, r4, #0
+; CHECK-NEXT: bfi r5, r12, #0, #8
+; CHECK-NEXT: cset r6, lo
+; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: csetm r6, ne
+; CHECK-NEXT: bfi r5, r6, #8, #8
+; CHECK-NEXT: vmsr p0, r5
+; CHECK-NEXT: vpsel q1, q1, q0
+; CHECK-NEXT: vand q0, q2, q0
+; CHECK-NEXT: vmov r1, r4, d0
+; CHECK-NEXT: vmov r6, r5, d2
+; CHECK-NEXT: vmov d0, r2, r3
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: sbcs.w r1, r5, r4
+; CHECK-NEXT: vmov r5, r4, d1
+; CHECK-NEXT: cset r1, lo
+; CHECK-NEXT: vldr d1, [sp, #16]
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: csetm r1, ne
+; CHECK-NEXT: bfi r0, r1, #0, #8
+; CHECK-NEXT: vmov r1, r6, d3
+; CHECK-NEXT: subs r1, r1, r5
+; CHECK-NEXT: sbcs.w r1, r6, r4
+; CHECK-NEXT: cset r1, lo
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: csetm r1, ne
+; CHECK-NEXT: bfi r0, r1, #8, #8
+; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: add r0, sp, #24
-; CHECK-NEXT: bfi r1, r12, #8, #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
%select = select <2 x i1> %active.lane.mask, <2 x i64> %V1, <2 x i64> %V2
ret <2 x i64> %select
@@ -60,15 +66,11 @@ define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) {
; CHECK-LABEL: v4i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: adr.w r12, .LCPI1_0
-; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vdup.32 q1, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vqadd.u32 q0, q0, r0
; CHECK-NEXT: add r0, sp, #8
; CHECK-NEXT: vcmp.u32 hi, q1, q0
-; CHECK-NEXT: vdup.32 q1, r1
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.u32 hi, q1, q0
; CHECK-NEXT: vldr d1, [sp]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov d0, r2, r3
@@ -91,41 +93,31 @@ define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) {
define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
; CHECK-LABEL: v7i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: adr r3, .LCPI2_0
-; CHECK-NEXT: vdup.32 q1, r1
-; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: ldr.w r12, [sp, #40]
+; CHECK-NEXT: vdup.32 q3, r2
; CHECK-NEXT: ldr r3, [sp, #32]
-; CHECK-NEXT: vadd.i32 q2, q0, r1
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: vcmp.u32 hi, q1, q2
-; CHECK-NEXT: ldr r2, [sp, #40]
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.u32 hi, q0, q2
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT: ldr r2, [sp, #44]
+; CHECK-NEXT: adr r2, .LCPI2_1
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r12
+; CHECK-NEXT: ldr.w r12, [sp, #44]
; CHECK-NEXT: ldr r3, [sp, #36]
-; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
-; CHECK-NEXT: ldr r2, [sp, #8]
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
+; CHECK-NEXT: ldr.w r12, [sp, #8]
; CHECK-NEXT: ldr r3, [sp]
-; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
-; CHECK-NEXT: ldr r2, [sp, #12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: ldr.w r12, [sp, #12]
; CHECK-NEXT: ldr r3, [sp, #4]
-; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
-; CHECK-NEXT: adr r2, .LCPI2_1
-; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: vstrw.32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r2]
-; CHECK-NEXT: movw r2, #4095
-; CHECK-NEXT: vadd.i32 q2, q2, r1
-; CHECK-NEXT: vcmp.u32 hi, q1, q2
-; CHECK-NEXT: vmrs r1, p0
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: adr r3, .LCPI2_0
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vqadd.u32 q2, q2, r1
+; CHECK-NEXT: vcmp.u32 hi, q3, q2
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: ldr r2, [sp, #48]
-; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vqadd.u32 q0, q0, r1
; CHECK-NEXT: ldr r1, [sp, #52]
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.u32 hi, q0, q2
+; CHECK-NEXT: vcmp.u32 hi, q3, q0
; CHECK-NEXT: vmov.32 q0[1], r1
; CHECK-NEXT: ldr r1, [sp, #56]
; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
@@ -162,15 +154,15 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
; CHECK-LABEL: v8i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr.w r12, .LCPI3_0
-; CHECK-NEXT: vdup.32 q5, r1
+; CHECK-NEXT: vdup.32 q1, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: vmov.i8 q1, #0x0
-; CHECK-NEXT: vmov.i8 q2, #0xff
-; CHECK-NEXT: vadd.i32 q3, q0, r0
-; CHECK-NEXT: vcmp.u32 hi, q5, q3
-; CHECK-NEXT: vpsel q4, q2, q1
+; CHECK-NEXT: vmov.i8 q2, #0x0
+; CHECK-NEXT: vmov.i8 q3, #0xff
+; CHECK-NEXT: vqadd.u32 q0, q0, r0
+; CHECK-NEXT: vcmp.u32 hi, q1, q0
+; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q0[0], r1
; CHECK-NEXT: vmov.16 q0[1], r12
@@ -179,44 +171,24 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
; CHECK-NEXT: adr r1, .LCPI3_1
; CHECK-NEXT: vldrw.u32 q4, [r1]
; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vadd.i32 q4, q4, r0
-; CHECK-NEXT: vcmp.u32 hi, q5, q4
-; CHECK-NEXT: vpsel q5, q2, q1
-; CHECK-NEXT: vmov r1, r12, d10
-; CHECK-NEXT: vmov.16 q0[4], r1
-; CHECK-NEXT: vmov.16 q0[5], r12
-; CHECK-NEXT: vmov r1, r12, d11
-; CHECK-NEXT: vdup.32 q5, r0
-; CHECK-NEXT: vmov.16 q0[6], r1
-; CHECK-NEXT: vcmp.u32 hi, q5, q3
-; CHECK-NEXT: vmov.16 q0[7], r12
-; CHECK-NEXT: vpsel q6, q2, q1
-; CHECK-NEXT: vcmp.u32 hi, q5, q4
-; CHECK-NEXT: vmov r0, r1, d12
-; CHECK-NEXT: vpsel q1, q2, q1
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.16 q3[1], r1
-; CHECK-NEXT: vmov r0, r1, d13
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.16 q3[3], r1
+; CHECK-NEXT: vqadd.u32 q4, q4, r0
+; CHECK-NEXT: vcmp.u32 hi, q1, q4
+; CHECK-NEXT: vpsel q1, q3, q2
; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.16 q3[5], r1
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: add r0, sp, #56
-; CHECK-NEXT: vmov.16 q3[7], r1
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: add r0, sp, #24
+; CHECK-NEXT: vmov.16 q0[7], r1
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vcmp.i16 ne, q3, zr
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.i16 ne, q0, zr
-; CHECK-NEXT: vldr d1, [sp, #48]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vldr d1, [sp, #16]
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
@@ -238,175 +210,99 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
; CHECK-LABEL: v16i8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr.w r12, .LCPI4_0
-; CHECK-NEXT: vdup.32 q7, r1
+; CHECK-NEXT: vdup.32 q3, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: vmov.i8 q5, #0x0
-; CHECK-NEXT: vmov.i8 q4, #0xff
-; CHECK-NEXT: vadd.i32 q1, q0, r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q1
-; CHECK-NEXT: vpsel q0, q4, q5
-; CHECK-NEXT: vmov r1, r12, d0
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: vqadd.u32 q0, q0, r0
+; CHECK-NEXT: vcmp.u32 hi, q3, q0
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q4, q1, q0
+; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q2[0], r1
; CHECK-NEXT: vmov.16 q2[1], r12
-; CHECK-NEXT: vmov r1, r12, d1
+; CHECK-NEXT: vmov r1, r12, d9
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: adr r1, .LCPI4_1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q4, [r1]
; CHECK-NEXT: vmov.16 q2[3], r12
-; CHECK-NEXT: vadd.i32 q3, q0, r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q3
-; CHECK-NEXT: vpsel q0, q4, q5
-; CHECK-NEXT: vmov r1, r12, d0
+; CHECK-NEXT: vqadd.u32 q4, q4, r0
+; CHECK-NEXT: vcmp.u32 hi, q3, q4
+; CHECK-NEXT: vpsel q4, q1, q0
+; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q2[4], r1
; CHECK-NEXT: vmov.16 q2[5], r12
-; CHECK-NEXT: vmov r1, r12, d1
+; CHECK-NEXT: vmov r1, r12, d9
; CHECK-NEXT: vmov.16 q2[6], r1
; CHECK-NEXT: vmov.16 q2[7], r12
; CHECK-NEXT: vcmp.i16 ne, q2, zr
-; CHECK-NEXT: vpsel q0, q4, q5
-; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vpsel q4, q1, q0
+; CHECK-NEXT: vmov.u16 r1, q4[0]
; CHECK-NEXT: vmov.8 q2[0], r1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: vmov.u16 r1, q4[1]
; CHECK-NEXT: vmov.8 q2[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r1, q4[2]
; CHECK-NEXT: vmov.8 q2[2], r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov.u16 r1, q4[3]
; CHECK-NEXT: vmov.8 q2[3], r1
-; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: vmov.u16 r1, q4[4]
; CHECK-NEXT: vmov.8 q2[4], r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov.u16 r1, q4[5]
; CHECK-NEXT: vmov.8 q2[5], r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: vmov.u16 r1, q4[6]
; CHECK-NEXT: vmov.8 q2[6], r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q4[7]
; CHECK-NEXT: vmov.8 q2[7], r1
; CHECK-NEXT: adr r1, .LCPI4_2
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vpsel q6, q4, q5
-; CHECK-NEXT: vmov r1, r12, d12
-; CHECK-NEXT: vmov.16 q0[0], r1
-; CHECK-NEXT: vmov.16 q0[1], r12
-; CHECK-NEXT: vmov r1, r12, d13
-; CHECK-NEXT: vmov.16 q0[2], r1
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: vqadd.u32 q4, q4, r0
+; CHECK-NEXT: vcmp.u32 hi, q3, q4
+; CHECK-NEXT: vpsel q5, q1, q0
+; CHECK-NEXT: vmov r1, r12, d10
+; CHECK-NEXT: vmov.16 q4[0], r1
+; CHECK-NEXT: vmov.16 q4[1], r12
+; CHECK-NEXT: vmov r1, r12, d11
+; CHECK-NEXT: vmov.16 q4[2], r1
; CHECK-NEXT: adr r1, .LCPI4_3
-; CHECK-NEXT: vldrw.u32 q6, [r1]
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vadd.i32 q6, q6, r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q6
-; CHECK-NEXT: vpsel q7, q4, q5
-; CHECK-NEXT: vmov r1, r12, d14
-; CHECK-NEXT: vmov.16 q0[4], r1
-; CHECK-NEXT: vmov.16 q0[5], r12
-; CHECK-NEXT: vmov r1, r12, d15
-; CHECK-NEXT: vmov.16 q0[6], r1
-; CHECK-NEXT: vdup.32 q7, r0
-; CHECK-NEXT: vmov.16 q0[7], r12
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
-; CHECK-NEXT: vpsel q0, q4, q5
-; CHECK-NEXT: vcmp.u32 hi, q7, q1
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vpsel q1, q4, q5
-; CHECK-NEXT: vmov.8 q2[8], r1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.8 q2[9], r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: vmov.8 q2[10], r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov.8 q2[11], r1
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov.8 q2[12], r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov.8 q2[13], r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: vmov.8 q2[14], r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
-; CHECK-NEXT: vmov.8 q2[15], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q3
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vpsel q1, q4, q5
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
-; CHECK-NEXT: vpsel q0, q4, q5
-; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.8 q3[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.8 q3[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.8 q3[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.8 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.8 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.8 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.8 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q3[7], r0
-; CHECK-NEXT: vcmp.u32 hi, q7, q0
-; CHECK-NEXT: vpsel q1, q4, q5
-; CHECK-NEXT: vcmp.u32 hi, q7, q6
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vpsel q1, q4, q5
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
-; CHECK-NEXT: vpsel q0, q4, q5
+; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: vmov.16 q4[3], r12
+; CHECK-NEXT: vqadd.u32 q5, q5, r0
+; CHECK-NEXT: vcmp.u32 hi, q3, q5
+; CHECK-NEXT: vpsel q3, q1, q0
+; CHECK-NEXT: vmov r0, r1, d6
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov.16 q4[5], r1
+; CHECK-NEXT: vmov r0, r1, d7
+; CHECK-NEXT: vmov.16 q4[6], r0
+; CHECK-NEXT: vmov.16 q4[7], r1
+; CHECK-NEXT: vcmp.i16 ne, q4, zr
+; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.8 q3[8], r0
+; CHECK-NEXT: vmov.8 q2[8], r0
; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.8 q3[9], r0
+; CHECK-NEXT: vmov.8 q2[9], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.8 q3[10], r0
+; CHECK-NEXT: vmov.8 q2[10], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.8 q3[11], r0
+; CHECK-NEXT: vmov.8 q2[11], r0
; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.8 q3[12], r0
+; CHECK-NEXT: vmov.8 q2[12], r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.8 q3[13], r0
+; CHECK-NEXT: vmov.8 q2[13], r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.8 q3[14], r0
+; CHECK-NEXT: vmov.8 q2[14], r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.8 q3[15], r0
-; CHECK-NEXT: add r0, sp, #88
-; CHECK-NEXT: vcmp.i8 ne, q3, zr
-; CHECK-NEXT: vldr d1, [sp, #80]
-; CHECK-NEXT: vpnot
+; CHECK-NEXT: vmov.8 q2[15], r0
+; CHECK-NEXT: add r0, sp, #40
+; CHECK-NEXT: vldr d1, [sp, #32]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.i8 ne, q2, zr
+; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 98f97672efbbb..f0e89431af46f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -366,23 +366,23 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ implicit-def: $r8
; CHECK-NEXT: @ implicit-def: $r5
; CHECK-NEXT: @ implicit-def: $r10
-; CHECK-NEXT: strd r3, r0, [sp] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r3, r0, [sp, #16] @ 8-byte Folded Spill
; CHECK-NEXT: add.w r6, r7, r2, lsr #1
; CHECK-NEXT: add.w r1, r1, r2, lsr #1
; CHECK-NEXT: movw r2, #65532
; CHECK-NEXT: vdup.32 q6, r6
; CHECK-NEXT: movt r2, #32767
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w r1, r7, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r1, .LCPI1_1
; CHECK-NEXT: vldrw.u32 q5, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r12
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q3, q0, r12
+; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
@@ -404,7 +404,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: subs r1, r2, r1
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: add.w r10, r0, #7
-; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT: adds r5, #2
@@ -431,7 +431,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: mov r7, r12
; CHECK-NEXT: bl __aeabi_ldivmod
-; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload
+; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
; CHECK-NEXT: mov r12, r7
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: mov r7, r10
@@ -454,13 +455,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ %bb.9: @ %for.body13.us51.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: movw r2, :lower16:a
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: movt r2, :upper16:a
; CHECK-NEXT: str r1, [r2]
; CHECK-NEXT: movw r2, :lower16:b
; CHECK-NEXT: movt r2, :upper16:b
; CHECK-NEXT: str r1, [r2]
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: dlstp.32 lr, r6
; CHECK-NEXT: .LBB1_10: @ %vector.body111
; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1
@@ -474,21 +475,17 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: b .LBB1_13
; CHECK-NEXT: .LBB1_11: @ %vector.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1
; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q2, q5, r1
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
+; CHECK-NEXT: vqadd.u32 q2, q5, r1
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: add.w r1, r1, #4
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.u32 hi, q6, q2
+; CHECK-NEXT: vcmp.u32 hi, q6, q2
; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: add.w r1, r1, #4
; CHECK-NEXT: vadd.i32 q2, q2, r11
; CHECK-NEXT: vadd.i32 q1, q1, q7
; CHECK-NEXT: vpst
@@ -509,7 +506,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: beq.w .LBB1_2
; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload
; CHECK-NEXT: mov r2, r10
; CHECK-NEXT: .LBB1_17: @ %for.body6.us60
; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1