[llvm] [AArch64] Override isLSRCostLess, take number of instructions into account (PR #84189)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 6 07:56:33 PST 2024
https://github.com/huntergr-arm created https://github.com/llvm/llvm-project/pull/84189
Adds an AArch64-specific version of isLSRCostLess, changing the relative importance of the various terms from the formulae being evaluated.
This has been split out from my vscale-aware LSR work, see the RFC for reference: https://discourse.llvm.org/t/rfc-vscale-aware-loopstrengthreduce/77131
I intend to do some benchmarking of this independently of the LSR work to check that there are no major regressions.
>From c5c25680a2ef39e18711b2f5aaeb351b20875fca Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 6 Mar 2024 14:35:10 +0000
Subject: [PATCH] [AArch64] Override isLSRCostLess, take number of instructions
into account
---
.../AArch64/AArch64TargetTransformInfo.cpp | 19 ++
.../AArch64/AArch64TargetTransformInfo.h | 3 +
.../AArch64/arm64-2011-10-18-LdStOptBug.ll | 2 +-
.../test/CodeGen/AArch64/arm64-ldp-cluster.ll | 12 +-
...rleaving-reductions-predicated-scalable.ll | 92 ++++-----
...plex-deinterleaving-reductions-scalable.ll | 126 +++++-------
.../complex-deinterleaving-reductions.ll | 17 +-
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 193 +++++++++---------
.../LoopStrengthReduce/AArch64/lsr-reuse.ll | 6 +-
9 files changed, 237 insertions(+), 233 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 755b034764ed2d..9ed98267e35fcc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
+ cl::init(true), cl::Hidden);
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -4152,3 +4155,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
return true;
return BaseT::shouldTreatInstructionLikeSelect(I);
}
+
+bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2) {
+ // The AArch64-specific part is that the instruction count is included in
+ // the comparison (though not as the first criterion, as some targets do),
+ // and the priority of the base additions is changed.
+ // TODO: Maybe a more nuanced tradeoff between instruction count
+ // and number of registers? To be investigated at a later date.
+ if (EnableLSRCostOpt)
+ return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
+ C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
+ C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+
+ return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e1..f438cf7f615920 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -424,6 +424,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
}
std::optional<unsigned> getMinPageSize() const { return 4096; }
+
+ bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
+ const TargetTransformInfo::LSRCost &C2);
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
index 3b6c4fa875e604..dafdcf82f311d4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
@@ -12,7 +12,7 @@ entry:
for.body:
; CHECK: for.body
-; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
; CHECK: add x[[REG:[0-9]+]],
; CHECK: x[[REG]], #1, lsl #12
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 8c7b31fd34c488..114203e46f196b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -176,13 +176,13 @@ exit:
; CHECK: ********** MI Scheduling **********
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
;
-; CHECK: Cluster ld/st SU(2) - SU(6)
-; CHECK: Cluster ld/st SU(3) - SU(7)
+; CHECK: Cluster ld/st SU(0) - SU(4)
+; CHECK: Cluster ld/st SU(1) - SU(5)
;
-; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
-; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
-; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
-; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
+; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
+; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
;
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
entry:
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 467c3c254fc2d3..cb219bf28c5109 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -14,31 +14,29 @@ target triple = "aarch64"
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: rdvl x11, #2
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: mov x11, x9
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: zip2 p3.d, p1.d, p1.d
-; CHECK-NEXT: add x13, x0, x8
-; CHECK-NEXT: add x14, x1, x8
-; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
-; CHECK-NEXT: whilelo p1.d, x12, x9
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: add x12, x12, x10
-; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
+; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
+; CHECK-NEXT: whilelo p1.d, x11, x8
+; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z4.d }, p3/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0]
+; CHECK-NEXT: ld1d { z5.d }, p2/z, [x1]
+; CHECK-NEXT: add x1, x1, x10
+; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
@@ -115,32 +113,30 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: neg x11, x10
-; CHECK-NEXT: mov w12, #100 // =0x64
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: neg x10, x9
+; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: and x11, x11, x12
-; CHECK-NEXT: rdvl x12, #2
+; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
-; CHECK-NEXT: add x13, x0, x8
-; CHECK-NEXT: add x14, x1, x8
+; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2]
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
-; CHECK-NEXT: add x9, x9, x10
-; CHECK-NEXT: add x8, x8, x12
+; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0
-; CHECK-NEXT: cmp x11, x9
+; CHECK-NEXT: cmp x10, x8
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
-; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
-; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14]
+; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
+; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
@@ -217,33 +213,33 @@ exit.block: ; preds = %vector.body
define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) {
; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w10, #100 // =0x64
+; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: whilelo p1.d, xzr, x10
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: cntd x11
-; CHECK-NEXT: rdvl x12, #2
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: cnth x11
+; CHECK-NEXT: mov x12, x9
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
-; CHECK-NEXT: add x13, x0, x8
-; CHECK-NEXT: add x14, x1, x8
+; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
-; CHECK-NEXT: add x9, x9, x11
-; CHECK-NEXT: add x8, x8, x12
+; CHECK-NEXT: add x2, x2, x11
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: zip2 p3.d, p1.d, p1.d
; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
-; CHECK-NEXT: whilelo p1.d, x9, x10
-; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
+; CHECK-NEXT: whilelo p1.d, x12, x8
+; CHECK-NEXT: add x12, x12, x9
+; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z4.d }, p3/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0]
+; CHECK-NEXT: ld1d { z5.d }, p2/z, [x1]
+; CHECK-NEXT: add x1, x1, x10
+; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 1696ac8709d406..933b5f05975106 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -15,30 +15,27 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x9, x9
-; CHECK-NEXT: mov w10, #100 // =0x64
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x9, x10
-; CHECK-NEXT: rdvl x11, #2
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: and x9, x8, x9
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x12, x0, x8
-; CHECK-NEXT: add x13, x1, x8
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
-; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: adds x9, x9, x8
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: add x1, x1, x10
+; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -105,13 +102,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: fmov d0, #1.00000000
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: fmov d2, #2.00000000
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: mov w10, #100 // =0x64
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: neg x9, x9
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x9, x10
-; CHECK-NEXT: rdvl x11, #2
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: and x9, x8, x9
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT: mov z1.d, p0/m, z2.d
; CHECK-NEXT: ptrue p0.d
@@ -119,18 +114,17 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x12, x0, x8
-; CHECK-NEXT: add x13, x1, x8
-; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
-; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: adds x9, x9, x8
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: add x1, x1, x10
+; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -190,45 +184,37 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cntw x9
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x9, x9
-; CHECK-NEXT: mov w10, #1000 // =0x3e8
-; CHECK-NEXT: rdvl x12, #2
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x9, x10
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: mov w9, #1000 // =0x3e8
+; CHECK-NEXT: rdvl x10, #4
+; CHECK-NEXT: and x9, x8, x9
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT: add x11, x1, x12
-; CHECK-NEXT: add x12, x0, x12
-; CHECK-NEXT: rdvl x13, #4
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x14, x0, x8
-; CHECK-NEXT: add x15, x12, x8
-; CHECK-NEXT: add x16, x1, x8
-; CHECK-NEXT: add x17, x11, x8
-; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
-; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x8]
-; CHECK-NEXT: ld1d { z16.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
-; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
-; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
-; CHECK-NEXT: add x8, x8, x13
-; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
-; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
-; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
-; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
-; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #90
-; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #90
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
+; CHECK-NEXT: adds x9, x9, x8
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
+; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
+; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: add x1, x1, x10
+; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
+; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0
+; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90
+; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90
+; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index 44d0a9392ba629..aed3072bb4af37 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -148,17 +148,16 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add x8, x0, #32
+; CHECK-NEXT: add x9, x1, #32
+; CHECK-NEXT: mov x10, #-100 // =0xffffffffffffff9c
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: add x8, x8, #64
-; CHECK-NEXT: ldp q5, q4, [x9]
-; CHECK-NEXT: cmp x8, #1600
-; CHECK-NEXT: ldp q7, q6, [x10]
-; CHECK-NEXT: ldp q17, q16, [x9, #32]
-; CHECK-NEXT: ldp q19, q18, [x10, #32]
+; CHECK-NEXT: ldp q5, q4, [x8, #-32]
+; CHECK-NEXT: adds x10, x10, #4
+; CHECK-NEXT: ldp q7, q6, [x9, #-32]
+; CHECK-NEXT: ldp q17, q16, [x8], #64
+; CHECK-NEXT: ldp q19, q18, [x9], #64
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0
; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0
; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 08ad34c7b03ba0..54d7ecfaa8caf3 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1669,42 +1669,41 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh18:
-; CHECK-NEXT: adrp x9, lCPI17_0@PAGE
+; CHECK-NEXT: adrp x8, lCPI17_0@PAGE
; CHECK-NEXT: Lloh19:
-; CHECK-NEXT: adrp x10, lCPI17_1@PAGE
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: adrp x9, lCPI17_1@PAGE
+; CHECK-NEXT: mov w10, #128 ; =0x80
; CHECK-NEXT: Lloh20:
-; CHECK-NEXT: ldr q0, [x9, lCPI17_0@PAGEOFF]
+; CHECK-NEXT: ldr q0, [x8, lCPI17_0@PAGEOFF]
; CHECK-NEXT: Lloh21:
-; CHECK-NEXT: ldr q1, [x10, lCPI17_1@PAGEOFF]
+; CHECK-NEXT: ldr q1, [x9, lCPI17_1@PAGEOFF]
+; CHECK-NEXT: add x8, x1, #64
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: LBB17_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp d2, d3, [x9, #-8]
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: ldp q6, q5, [x10, #32]
-; CHECK-NEXT: add x8, x8, #128
-; CHECK-NEXT: ldp q17, q16, [x10]
-; CHECK-NEXT: cmp x8, #1024
+; CHECK-NEXT: subs x10, x10, #16
+; CHECK-NEXT: ldp q6, q5, [x8, #-32]
+; CHECK-NEXT: add x9, x9, #16
+; CHECK-NEXT: ldp q17, q16, [x8, #-64]
; CHECK-NEXT: tbl.16b v4, { v2 }, v1
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
; CHECK-NEXT: tbl.16b v7, { v3 }, v1
; CHECK-NEXT: tbl.16b v3, { v3 }, v0
-; CHECK-NEXT: add x9, x9, #16
; CHECK-NEXT: uaddw2.2d v5, v5, v4
; CHECK-NEXT: uaddw.2d v4, v6, v4
; CHECK-NEXT: uaddw2.2d v6, v16, v2
-; CHECK-NEXT: ldp q18, q16, [x10, #96]
+; CHECK-NEXT: ldp q18, q16, [x8, #32]
; CHECK-NEXT: uaddw.2d v2, v17, v2
-; CHECK-NEXT: stp q4, q5, [x10, #32]
-; CHECK-NEXT: ldp q17, q5, [x10, #64]
+; CHECK-NEXT: stp q4, q5, [x8, #-32]
+; CHECK-NEXT: ldp q17, q5, [x8]
; CHECK-NEXT: uaddw2.2d v16, v16, v7
; CHECK-NEXT: uaddw.2d v7, v18, v7
-; CHECK-NEXT: stp q2, q6, [x10]
+; CHECK-NEXT: stp q2, q6, [x8, #-64]
; CHECK-NEXT: uaddw2.2d v4, v5, v3
; CHECK-NEXT: uaddw.2d v3, v17, v3
-; CHECK-NEXT: stp q7, q16, [x10, #96]
-; CHECK-NEXT: stp q3, q4, [x10, #64]
+; CHECK-NEXT: stp q7, q16, [x8, #32]
+; CHECK-NEXT: stp q3, q4, [x8], #128
; CHECK-NEXT: b.ne LBB17_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -1715,67 +1714,67 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: adrp x9, .LCPI17_0
; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_0
-; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: mov w8, #128 // =0x80
; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT: adrp x9, .LCPI17_1
; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_1
; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
-; CHECK-BE-NEXT: add x9, x0, #8
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: add x10, x0, #8
; CHECK-BE-NEXT: .LBB17_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: sub x10, x9, #8
-; CHECK-BE-NEXT: ld1 { v2.8b }, [x9]
-; CHECK-BE-NEXT: add x9, x9, #16
-; CHECK-BE-NEXT: ld1 { v3.8b }, [x10]
-; CHECK-BE-NEXT: add x10, x1, x8
-; CHECK-BE-NEXT: add x8, x8, #128
-; CHECK-BE-NEXT: add x15, x10, #96
-; CHECK-BE-NEXT: add x11, x10, #32
-; CHECK-BE-NEXT: add x14, x10, #64
+; CHECK-BE-NEXT: sub x11, x10, #8
+; CHECK-BE-NEXT: ld1 { v2.8b }, [x10]
+; CHECK-BE-NEXT: add x15, x9, #32
+; CHECK-BE-NEXT: ld1 { v3.8b }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #64
+; CHECK-BE-NEXT: sub x12, x9, #32
+; CHECK-BE-NEXT: ld1 { v6.2d }, [x9]
+; CHECK-BE-NEXT: ld1 { v16.2d }, [x15]
+; CHECK-BE-NEXT: ld1 { v20.2d }, [x12]
; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b
; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b
-; CHECK-BE-NEXT: ld1 { v16.2d }, [x15]
+; CHECK-BE-NEXT: ld1 { v21.2d }, [x11]
; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b
; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b
-; CHECK-BE-NEXT: ld1 { v6.2d }, [x10]
-; CHECK-BE-NEXT: ld1 { v19.2d }, [x14]
-; CHECK-BE-NEXT: ld1 { v21.2d }, [x11]
-; CHECK-BE-NEXT: add x12, x10, #48
-; CHECK-BE-NEXT: add x13, x10, #16
-; CHECK-BE-NEXT: add x16, x10, #112
-; CHECK-BE-NEXT: add x17, x10, #80
+; CHECK-BE-NEXT: sub x13, x9, #16
+; CHECK-BE-NEXT: sub x14, x9, #48
+; CHECK-BE-NEXT: add x16, x9, #48
+; CHECK-BE-NEXT: add x17, x9, #16
+; CHECK-BE-NEXT: subs x8, x8, #16
+; CHECK-BE-NEXT: add x10, x10, #16
; CHECK-BE-NEXT: rev32 v7.8b, v4.8b
; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8
; CHECK-BE-NEXT: rev32 v17.8b, v2.8b
; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT: ext v19.16b, v3.16b, v3.16b, #8
; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-BE-NEXT: rev32 v5.8b, v5.8b
; CHECK-BE-NEXT: rev32 v3.8b, v3.8b
-; CHECK-BE-NEXT: cmp x8, #1024
; CHECK-BE-NEXT: rev32 v4.8b, v4.8b
; CHECK-BE-NEXT: uaddw v7.2d, v16.2d, v7.2s
-; CHECK-BE-NEXT: ld1 { v16.2d }, [x16]
-; CHECK-BE-NEXT: rev32 v18.8b, v18.8b
-; CHECK-BE-NEXT: rev32 v20.8b, v20.8b
+; CHECK-BE-NEXT: uaddw v6.2d, v6.2d, v17.2s
+; CHECK-BE-NEXT: rev32 v17.8b, v18.8b
+; CHECK-BE-NEXT: rev32 v19.8b, v19.8b
; CHECK-BE-NEXT: rev32 v2.8b, v2.8b
-; CHECK-BE-NEXT: uaddw v17.2d, v19.2d, v17.2s
-; CHECK-BE-NEXT: ld1 { v19.2d }, [x12]
-; CHECK-BE-NEXT: uaddw v5.2d, v21.2d, v5.2s
-; CHECK-BE-NEXT: ld1 { v21.2d }, [x13]
-; CHECK-BE-NEXT: uaddw v3.2d, v6.2d, v3.2s
-; CHECK-BE-NEXT: ld1 { v6.2d }, [x17]
-; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s
+; CHECK-BE-NEXT: ld1 { v16.2d }, [x16]
+; CHECK-BE-NEXT: ld1 { v18.2d }, [x13]
+; CHECK-BE-NEXT: uaddw v5.2d, v20.2d, v5.2s
+; CHECK-BE-NEXT: ld1 { v20.2d }, [x14]
+; CHECK-BE-NEXT: uaddw v3.2d, v21.2d, v3.2s
+; CHECK-BE-NEXT: ld1 { v21.2d }, [x17]
; CHECK-BE-NEXT: st1 { v7.2d }, [x15]
-; CHECK-BE-NEXT: uaddw v7.2d, v19.2d, v18.2s
-; CHECK-BE-NEXT: uaddw v16.2d, v21.2d, v20.2s
-; CHECK-BE-NEXT: uaddw v2.2d, v6.2d, v2.2s
-; CHECK-BE-NEXT: st1 { v17.2d }, [x14]
-; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
-; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s
+; CHECK-BE-NEXT: st1 { v6.2d }, [x9]
+; CHECK-BE-NEXT: uaddw v6.2d, v18.2d, v17.2s
+; CHECK-BE-NEXT: uaddw v7.2d, v20.2d, v19.2s
+; CHECK-BE-NEXT: uaddw v2.2d, v21.2d, v2.2s
+; CHECK-BE-NEXT: add x9, x9, #128
+; CHECK-BE-NEXT: st1 { v5.2d }, [x12]
+; CHECK-BE-NEXT: st1 { v3.2d }, [x11]
; CHECK-BE-NEXT: st1 { v4.2d }, [x16]
-; CHECK-BE-NEXT: st1 { v7.2d }, [x12]
-; CHECK-BE-NEXT: st1 { v16.2d }, [x13]
+; CHECK-BE-NEXT: st1 { v6.2d }, [x13]
+; CHECK-BE-NEXT: st1 { v7.2d }, [x14]
; CHECK-BE-NEXT: st1 { v2.2d }, [x17]
; CHECK-BE-NEXT: b.ne .LBB17_1
; CHECK-BE-NEXT: // %bb.2: // %exit
@@ -1813,14 +1812,14 @@ exit:
define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #128 ; =0x80
; CHECK-NEXT: add x9, x1, #128
+; CHECK-NEXT: add x10, x0, #16
; CHECK-NEXT: LBB18_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x10, x0, x8
-; CHECK-NEXT: add x8, x8, #16
-; CHECK-NEXT: ldp q0, q1, [x10]
-; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ldp q0, q1, [x10, #-16]
+; CHECK-NEXT: subs x8, x8, #16
+; CHECK-NEXT: add x10, x10, #16
; CHECK-NEXT: ushll2.8h v2, v0, #0
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll2.8h v6, v1, #0
@@ -1863,18 +1862,18 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: mov w8, #128 // =0x80
; CHECK-BE-NEXT: add x9, x1, #128
+; CHECK-BE-NEXT: add x10, x0, #16
; CHECK-BE-NEXT: .LBB18_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: add x10, x0, x8
-; CHECK-BE-NEXT: sub x11, x9, #32
-; CHECK-BE-NEXT: add x8, x8, #16
-; CHECK-BE-NEXT: ld1 { v0.16b }, [x10]
-; CHECK-BE-NEXT: add x10, x10, #16
-; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: sub x11, x10, #16
; CHECK-BE-NEXT: ld1 { v5.16b }, [x10]
-; CHECK-BE-NEXT: sub x10, x9, #16
+; CHECK-BE-NEXT: sub x12, x9, #32
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #16
+; CHECK-BE-NEXT: subs x8, x8, #16
+; CHECK-BE-NEXT: add x10, x10, #16
; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
@@ -1885,54 +1884,54 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT: ushll2 v6.2d, v1.4s, #0
; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v4.2d }, [x11]
; CHECK-BE-NEXT: ushll2 v4.2d, v3.4s, #0
; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0
-; CHECK-BE-NEXT: st1 { v2.2d }, [x11]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x12]
; CHECK-BE-NEXT: ushll2 v2.8h, v5.16b, #0
-; CHECK-BE-NEXT: sub x11, x9, #80
-; CHECK-BE-NEXT: sub x10, x9, #48
-; CHECK-BE-NEXT: st1 { v4.2d }, [x11]
+; CHECK-BE-NEXT: sub x12, x9, #80
+; CHECK-BE-NEXT: sub x11, x9, #48
+; CHECK-BE-NEXT: st1 { v4.2d }, [x12]
; CHECK-BE-NEXT: ushll v4.8h, v5.8b, #0
-; CHECK-BE-NEXT: sub x11, x9, #64
+; CHECK-BE-NEXT: sub x12, x9, #64
; CHECK-BE-NEXT: ushll2 v5.4s, v2.8h, #0
-; CHECK-BE-NEXT: st1 { v1.2d }, [x11]
-; CHECK-BE-NEXT: sub x11, x9, #96
+; CHECK-BE-NEXT: st1 { v1.2d }, [x12]
+; CHECK-BE-NEXT: sub x12, x9, #96
; CHECK-BE-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT: st1 { v6.2d }, [x10]
-; CHECK-BE-NEXT: sub x10, x9, #128
-; CHECK-BE-NEXT: st1 { v3.2d }, [x11]
+; CHECK-BE-NEXT: st1 { v6.2d }, [x11]
+; CHECK-BE-NEXT: sub x11, x9, #128
+; CHECK-BE-NEXT: st1 { v3.2d }, [x12]
; CHECK-BE-NEXT: ushll2 v3.4s, v4.8h, #0
; CHECK-BE-NEXT: ushll2 v6.2d, v5.4s, #0
-; CHECK-BE-NEXT: sub x11, x9, #112
+; CHECK-BE-NEXT: sub x12, x9, #112
; CHECK-BE-NEXT: ushll v5.2d, v5.2s, #0
-; CHECK-BE-NEXT: st1 { v0.2d }, [x10]
-; CHECK-BE-NEXT: st1 { v1.2d }, [x11]
+; CHECK-BE-NEXT: st1 { v0.2d }, [x11]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x12]
; CHECK-BE-NEXT: ushll2 v1.2d, v2.4s, #0
-; CHECK-BE-NEXT: add x10, x9, #112
+; CHECK-BE-NEXT: add x11, x9, #112
; CHECK-BE-NEXT: ushll v4.4s, v4.4h, #0
; CHECK-BE-NEXT: ushll2 v0.2d, v3.4s, #0
-; CHECK-BE-NEXT: st1 { v6.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #96
+; CHECK-BE-NEXT: st1 { v6.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #96
; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0
-; CHECK-BE-NEXT: st1 { v5.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #80
-; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #48
+; CHECK-BE-NEXT: st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #80
+; CHECK-BE-NEXT: st1 { v1.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #48
; CHECK-BE-NEXT: ushll2 v1.2d, v4.4s, #0
-; CHECK-BE-NEXT: st1 { v0.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v0.2d }, [x11]
; CHECK-BE-NEXT: ushll v0.2d, v4.2s, #0
-; CHECK-BE-NEXT: add x10, x9, #64
-; CHECK-BE-NEXT: st1 { v2.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #32
-; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: add x11, x9, #64
+; CHECK-BE-NEXT: st1 { v2.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #32
+; CHECK-BE-NEXT: st1 { v3.2d }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #16
; CHECK-BE-NEXT: st1 { v0.2d }, [x9]
; CHECK-BE-NEXT: add x9, x9, #128
-; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x11]
; CHECK-BE-NEXT: b.ne .LBB18_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
index 323e242620e66f..64e8a6be998eaf 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
@@ -2,7 +2,7 @@
declare void @foo(i64)
-; Verify that redundant adds aren't inserted by LSR.
+; Verify that redundant adds or geps aren't inserted by LSR.
; CHECK-LABEL: @bar(
define void @bar(ptr %A) {
entry:
@@ -10,9 +10,11 @@ entry:
while.cond:
; CHECK-LABEL: while.cond:
-; CHECK: add i64 %lsr.iv, 1
; CHECK-NOT: add i64 %lsr.iv, 1
; CHECK-LABEL: land.rhs:
+; CHECK: getelementptr i8, ptr %lsr.iv, i64 -8
+; CHECK-NOT: getelementptr i8, ptr %lsr.iv, i64 -8
+; CHECK-NOT: add i64 %lsr.iv, 1
%indvars.iv28 = phi i64 [ %indvars.iv.next29, %land.rhs ], [ 50, %entry ]
%cmp = icmp sgt i64 %indvars.iv28, 0
br i1 %cmp, label %land.rhs, label %while.end
More information about the llvm-commits
mailing list