[llvm] [LoopStrengthReduce] Encourage the creation of IVs whose increment can later be combined with memory instructions (PR #152995)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 04:14:23 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sergey Shcherbinin (SergeyShch01)
<details>
<summary>Changes</summary>
Encourage (via heuristics) the creation of IVs whose increment can later be combined with memory instructions as pre/post increments. Regression tests are updated accordingly.
---
Patch is 216.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152995.diff
38 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp (+16-4)
- (modified) llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll (+25-8)
- (modified) llvm/test/CodeGen/AArch64/cheap-as-a-move.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll (+3-5)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll (+8-14)
- (modified) llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll (+72-105)
- (modified) llvm/test/CodeGen/AArch64/machine-combiner-copy.ll (+6-7)
- (modified) llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll (+10-9)
- (modified) llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll (+68-83)
- (modified) llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll (+144-142)
- (modified) llvm/test/CodeGen/AArch64/reduce-or-opt.ll (+10-10)
- (modified) llvm/test/CodeGen/AArch64/sink-and-fold.ll (+21-20)
- (modified) llvm/test/CodeGen/AArch64/sink-mul-exts.ll (+18-24)
- (modified) llvm/test/CodeGen/AArch64/sinksplat.ll (+9-12)
- (modified) llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll (+21-27)
- (modified) llvm/test/CodeGen/AArch64/trunc-to-tbl.ll (+159-198)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fadd.ll (+27-36)
- (modified) llvm/test/CodeGen/AArch64/vldn_shuffle.ll (+22-33)
- (modified) llvm/test/CodeGen/AArch64/vselect-ext.ll (+9-12)
- (modified) llvm/test/CodeGen/AArch64/zext-to-tbl.ll (+227-275)
- (modified) llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll (+53-58)
- (modified) llvm/test/CodeGen/ARM/branch-on-zero.ll (+12-18)
- (modified) llvm/test/CodeGen/ARM/dsp-loop-indexing.ll (+55-56)
- (modified) llvm/test/CodeGen/ARM/fpclamptosat.ll (+73-69)
- (modified) llvm/test/CodeGen/ARM/loop-align-cortex-m.ll (+2-2)
- (modified) llvm/test/CodeGen/ARM/loop-indexing.ll (+52-52)
- (modified) llvm/test/CodeGen/Thumb/mvn.ll (+2-97)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll (+3-4)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll (+82-158)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-le-simple.ll (+19-24)
- (modified) llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll (+23-26)
- (modified) llvm/test/CodeGen/Thumb2/pacbti-m-varargs-1.ll (+5-5)
- (modified) llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll (+24-21)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll (+13-14)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll (+7-6)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll (+1-1)
- (modified) llvm/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll (+2-2)
- (modified) llvm/test/Transforms/LoopStrengthReduce/ARM/illegal-addr-modes.ll (+2-1)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e3ef9d8680b53..27d9190688ffa 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -523,6 +523,8 @@ struct Formula {
bool countsDownToZero() const;
+ bool isBaseRegOnly() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -717,6 +719,11 @@ bool Formula::countsDownToZero() const {
return StepInt->isNegative();
}
+bool Formula::isBaseRegOnly() const {
+ return BaseGV == nullptr && Scale == 0 && ScaledReg == nullptr &&
+ BaseOffset.isZero() && UnfoldedOffset.isZero() && BaseRegs.size() == 1;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -1425,12 +1432,17 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
const SCEV *Start;
const SCEVConstant *Step;
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
- // If the step size matches the base offset, we could use pre-indexed
- // addressing.
- if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
+ if ( // If the step size matches the base offset, we could use
+ // pre-indexed addressing.
+ (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
(AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L)))
+ SE->isLoopInvariant(Start, L)) ||
+ // general check for post-indexed addressing with specific step
+ (LU.Kind == LSRUse::Address && F.isBaseRegOnly() &&
+ TTI->isLegalAddressingMode(LU.AccessTy.MemTy, nullptr,
+ Step->getAPInt().getSExtValue(), true,
+ 0, LU.AccessTy.AddrSpace)))
LoopCost = 0;
}
// If the loop counts down to zero and we'll be using a hardware loop then
diff --git a/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll b/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
index c4da564434ee9..966ff15dff098 100644
--- a/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -loop-reduce < %s | FileCheck %s
; Scaling factor in addressing mode are costly.
; Make loop-reduce prefer unscaled accesses.
@@ -7,20 +8,38 @@ target triple = "arm64-apple-ios7.0.0"
; Function Attrs: nounwind ssp
define void @mulDouble(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) {
-; CHECK: @mulDouble
+; CHECK-LABEL: define void @mulDouble(
+; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr nuw i8, ptr [[C]], i64 16
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV6:%.*]] = phi ptr [ [[SCEVGEP7:%.*]], [[FOR_BODY]] ], [ [[B]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV4:%.*]] = phi ptr [ [[SCEVGEP5:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP3]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV2:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 19, [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP1:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[LSR_IV6]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[LSR_IV4]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store double [[MUL]], ptr [[LSR_IV]], align 8
+; CHECK-NEXT: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV2]], -1
+; CHECK-NEXT: [[SCEVGEP5]] = getelementptr i8, ptr [[LSR_IV4]], i64 8
+; CHECK-NEXT: [[SCEVGEP7]] = getelementptr i8, ptr [[LSR_IV6]], i64 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
-; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
-; Only one induction variable should have been generated.
-; CHECK-NOT: phi
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
%tmp = add nsw i64 %indvars.iv, -1
%arrayidx = getelementptr inbounds double, ptr %b, i64 %tmp
%tmp1 = load double, ptr %arrayidx, align 8
-; The induction variable should carry the scaling factor: 1 * 8 = 8.
-; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
%indvars.iv.next = add i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds double, ptr %c, i64 %indvars.iv.next
%tmp2 = load double, ptr %arrayidx2, align 8
@@ -28,8 +47,6 @@ for.body: ; preds = %for.body, %entry
%arrayidx4 = getelementptr inbounds double, ptr %a, i64 %indvars.iv
store double %mul, ptr %arrayidx4, align 8
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 8 = 152.
-; CHECK: icmp eq i32 {{%[^,]+}}, 152
%exitcond = icmp eq i32 %lftr.wideiv, 20
br i1 %exitcond, label %for.end, label %for.body
diff --git a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
index 50c70c5676c4a..673caa2a7e63c 100644
--- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
+++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -26,11 +26,11 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB0_2
; CHECK-NEXT: .LBB0_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
+; CHECK-NEXT: ldr w0, [x20]
; CHECK-NEXT: mov x1, x21
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x22, lsl #2]
; CHECK-NEXT: add x22, x22, #1
+; CHECK-NEXT: str w0, [x20], #4
; CHECK-NEXT: cmp x22, x19
; CHECK-NEXT: b.lt .LBB0_1
; CHECK-NEXT: .LBB0_2: // %exit
@@ -76,12 +76,12 @@ define void @f1(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB1_2
; CHECK-NEXT: .LBB1_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w0, [x20, x21, lsl #2]
+; CHECK-NEXT: ldr w0, [x20]
; CHECK-NEXT: mov x1, #1450704896 // =0x56780000
; CHECK-NEXT: movk x1, #4660, lsl #48
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: add x21, x21, #1
+; CHECK-NEXT: str w0, [x20], #4
; CHECK-NEXT: cmp x21, x19
; CHECK-NEXT: b.lt .LBB1_1
; CHECK-NEXT: .LBB1_2: // %exit
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 7542e9c4b8f5b..279b5e0a6dd81 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -36,10 +36,9 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: add x8, x0, #16
+; CHECK-NEXT: mov w8, #32 // =0x20
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: mov w9, #32 // =0x20
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: movi v5.2d, #0000000000000000
; CHECK-NEXT: movi v7.2d, #0000000000000000
@@ -47,9 +46,8 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-NEXT: movi v16.2d, #0000000000000000
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q17, q18, [x8, #-16]
-; CHECK-NEXT: subs x9, x9, #32
-; CHECK-NEXT: add x8, x8, #32
+; CHECK-NEXT: ldp q17, q18, [x0], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: cmeq v17.16b, v17.16b, #0
; CHECK-NEXT: cmeq v18.16b, v18.16b, #0
; CHECK-NEXT: ushll2 v19.8h, v17.16b, #0
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index aed3072bb4af3..78ad7ad81f84d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -16,15 +16,12 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1600 // =0x640
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: add x8, x8, #32
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: cmp x8, #1600
-; CHECK-NEXT: ldp q5, q4, [x10]
+; CHECK-NEXT: ldp q3, q2, [x0], #32
+; CHECK-NEXT: ldp q5, q4, [x1], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
@@ -83,15 +80,12 @@ define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1600 // =0x640
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: add x8, x8, #32
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: cmp x8, #1600
-; CHECK-NEXT: ldp q5, q4, [x10]
+; CHECK-NEXT: ldp q3, q2, [x0], #32
+; CHECK-NEXT: ldp q5, q4, [x1], #32
+; CHECK-NEXT: subs x8, x8, #32
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1fbca7ca2c27c..be20483b75f7e 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -32,13 +32,11 @@ define void @fptoui_v8f32_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI0_0 at PAGE
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr q0, [x8, lCPI0_0 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: ldp q2, q1, [x9]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: tbl.16b v1, { v3, v4 }, v0
@@ -111,22 +109,18 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI2_0 at PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr q0, [x8, lCPI2_0 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB2_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x1], #32
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: fcvtzu.4s v6, v1
; CHECK-NEXT: fcvtzu.4s v5, v7
; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0
-; CHECK-NEXT: str q1, [x2, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q1, [x2], #16
; CHECK-NEXT: b.eq LBB2_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -178,22 +172,18 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, p
; CHECK-NEXT: adrp x8, lCPI3_0 at PAGE
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q0, [x8, lCPI3_0 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB3_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q2, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v4, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x1], #32
; CHECK-NEXT: fcvtzu.4s v3, v2
; CHECK-NEXT: fcvtzu.4s v6, v1
; CHECK-NEXT: fcvtzu.4s v5, v7
; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0
-; CHECK-NEXT: str q1, [x2, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q1, [x2], #16
; CHECK-NEXT: b.eq LBB3_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -245,15 +235,13 @@ define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI4_0 at PAGE
; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q0, [x8, lCPI4_0 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB4_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #6
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: ldp q2, q1, [x9, #32]
+; CHECK-NEXT: ldp q2, q1, [x0, #32]
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v6, v1
-; CHECK-NEXT: ldp q7, q1, [x9]
+; CHECK-NEXT: ldp q7, q1, [x0], #64
; CHECK-NEXT: fcvtzu.4s v5, v2
; CHECK-NEXT: fcvtzu.4s v4, v1
; CHECK-NEXT: fcvtzu.4s v3, v7
@@ -306,30 +294,25 @@ define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI5_0 at PAGE
; CHECK-NEXT: Lloh9:
; CHECK-NEXT: ldr q0, [x8, lCPI5_0 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #6
-; CHECK-NEXT: add x10, x1, x9
-; CHECK-NEXT: add x9, x0, x9
-; CHECK-NEXT: ldp q2, q1, [x10, #32]
-; CHECK-NEXT: ldp q3, q4, [x9, #32]
-; CHECK-NEXT: ldp q5, q6, [x10]
+; CHECK-NEXT: ldp q2, q1, [x1, #32]
+; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: ldp q3, q4, [x0, #32]
+; CHECK-NEXT: ldp q5, q6, [x1], #64
; CHECK-NEXT: fcvtzu.4s v19, v1
; CHECK-NEXT: fcvtzu.4s v18, v2
-; CHECK-NEXT: ldp q2, q1, [x9]
; CHECK-NEXT: fcvtzu.4s v23, v4
-; CHECK-NEXT: fcvtzu.4s v17, v6
-; CHECK-NEXT: add x9, x2, x8, lsl #5
+; CHECK-NEXT: ldp q2, q1, [x0], #64
; CHECK-NEXT: fcvtzu.4s v22, v3
+; CHECK-NEXT: fcvtzu.4s v17, v6
; CHECK-NEXT: fcvtzu.4s v16, v5
-; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v21, v1
-; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: fcvtzu.4s v20, v2
; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
; CHECK-NEXT: tbl.16b v2, { v20, v21, v22, v23 }, v0
-; CHECK-NEXT: stp q2, q1, [x9]
+; CHECK-NEXT: stp q2, q1, [x2], #32
; CHECK-NEXT: b.eq LBB5_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -359,17 +342,15 @@ exit:
define void @fptoui_v8f32_to_v8i16_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: fptoui_v8f32_to_v8i16_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: ldp q0, q1, [x0], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v1, v1
; CHECK-NEXT: fcvtzu.4s v0, v0
; CHECK-NEXT: uzp1.8h v0, v0, v1
-; CHECK-NEXT: str q0, [x1, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: str q0, [x1], #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -394,24 +375,19 @@ exit:
define void @fptoui_2x_v8f32_to_v8i16_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-LABEL: fptoui_2x_v8f32_to_v8i16_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB7_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x9, x8, #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x11, x1, x9
-; CHECK-NEXT: add x9, x2, x9
-; CHECK-NEXT: ldp q0, q1, [x10]
-; CHECK-NEXT: ldp q2, q3, [x11]
+; CHECK-NEXT: ldp q0, q1, [x0], #32
+; CHECK-NEXT: ldp q2, q3, [x1], #32
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fcvtzu.4s v1, v1
; CHECK-NEXT: fcvtzu.4s v0, v0
; CHECK-NEXT: fcvtzu.4s v3, v3
; CHECK-NEXT: fcvtzu.4s v2, v2
; CHECK-NEXT: uzp1.8h v0, v0, v1
; CHECK-NEXT: uzp1.8h v1, v2, v3
-; CHECK-NEXT: stp q0, q1, [x9]
+; CHECK-NEXT: stp q0, q1, [x2], #32
; CHECK-NEXT: b.eq LBB7_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -483,18 +459,16 @@ define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q0, [x8, lCPI8_0 at PAGEOFF]
; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q1, [x9, lCPI8_1 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB8_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d2, [x0, x8, lsl #3]
-; CHECK-NEXT: add x9, x1, x8, lsl #5
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldr d2, [x0], #8
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: tbl.16b v3, { v2 }, v0
; CHECK-NEXT: tbl.16b v2, { v2 }, v1
; CHECK-NEXT: ucvtf.4s v3, v3
; CHECK-NEXT: ucvtf.4s v2, v2
-; CHECK-NEXT: stp q2, q3, [x9]
+; CHECK-NEXT: stp q2, q3, [x1], #32
; CHECK-NEXT: b.eq LBB8_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -606,13 +580,11 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q2, [x10, lCPI9_2 at PAGEOFF]
; CHECK-NEXT: Lloh21:
; CHECK-NEXT: ldr q3, [x8, lCPI9_3 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1000 ; =0x3e8
; CHECK-NEXT: LBB9_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8, lsl #4]
-; CHECK-NEXT: add x9, x1, x8, lsl #6
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldr q4, [x0], #16
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: tbl.16b v5, { v4 }, v0
; CHECK-NEXT: tbl.16b v6, { v4 }, v1
; CHECK-NEXT: tbl.16b v7, { v4 }, v2
@@ -621,8 +593,8 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: ucvtf.4s v6, v6
; CHECK-NEXT: ucvtf.4s v7, v7
; CHECK-NEXT: ucvtf.4s v4, v4
-; CHECK-NEXT: stp q6, q5, [x9, #32]
-; CHECK-NEXT: stp q4, q7, [x9]
+; CHECK-NEXT: stp q6, q5, [x1, #32]
+; CHECK-NEXT: stp q4, q7, [x1], #64
; CHECK-NEXT: b.eq LBB9_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -668,13 +640,11 @@ define void @uitofp_v8i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapt
; CHECK-NEXT: ldr q2, [x10, lCPI10_2 at PAGEOFF]
; CHECK-NEXT: Lloh29:
; CHECK-NEXT: ldr q3, [x8, lCPI10_3 at PAGEOFF]
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w8, #1024 ; =0x400
; CHECK-NEXT: LBB10_1: ; %vector.body
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8]
-; CHECK-NEXT: add x9, x1, x8
-; CHECK-NEXT: add x8, x8, #64
-; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192
+; CHECK-NEXT: ldr q4, [x0], #64
+; CHECK-NEXT: subs x8, x8, #8
; CHECK-NEXT: tbl.16b v5, { v4 }, v0
; CHECK-NEXT: tbl.1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152995
More information about the llvm-commits
mailing list