[llvm] [AArch64][SVE] Fold ADD+CNTB to INCB and DECB (PR #118280)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 2 02:38:36 PST 2024
https://github.com/rj-jesus created https://github.com/llvm/llvm-project/pull/118280
Currently, given:
```cpp
uint64_t incb(uint64_t x) {
return x+svcntb();
}
```
LLVM generates:
```gas
incb:
addvl x0, x0, #1
ret
```
This is functionally equivalent to:
```gas
incb:
incb x0
ret
```
However, according to the Software Optimization Guides (SWOGs) for microarchitectures such as the Neoverse V2 and Neoverse V3, the second form (with INCB) can have significantly better latency and throughput. On the Neoverse V2, for example, ADDVL has a latency and throughput of 2, whereas some forms of INCB have a latency of 1 and a throughput of 4 (similarly on the Neoverse V3, where the throughput is further increased to 8). The same applies to DECB. This patch adds patterns to prefer the INCB/DECB forms over ADDVL where applicable.
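For reference, here is a minimal sketch of the DECB side and of an immediate multiple, assuming the ACLE `svcntb` intrinsic from `arm_sve.h` and an SVE-enabled compile (e.g. `-march=armv8-a+sve`); the function names are illustrative only:
```cpp
// Hedged sketch: with this patch, these are expected to select DECB/INCB
// (e.g. "decb x0" and "incb x0, all, mul #2") rather than ADDVL.
#include <arm_sve.h>
#include <stdint.h>

uint64_t decb(uint64_t x) {
  return x - svcntb();      // subtract one full vector of bytes (16 * vscale)
}

uint64_t incb_mul2(uint64_t x) {
  return x + 2 * svcntb();  // immediate multiples in [1, 16] still map to INCB
}
```
The existing ADDVL pattern is kept as a lower-priority fallback for immediates outside that range (the RDVL-style immediates in [-32, 31]).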
>From f67cb05f238c25a174965b9010892e927c2f15a7 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 29 Nov 2024 10:58:31 -0800
Subject: [PATCH] [AArch64][SVE] Fold ADD+CNTB to INCB/DECB
Currently, given:
```cpp
uint64_t incb(uint64_t x) {
return x+svcntb();
}
```
LLVM generates:
```gas
incb:
addvl x0, x0, #1
ret
```
This is functionally equivalent to:
```gas
incb:
incb x0
ret
```
However, on microarchitectures like the Neoverse V2 and Neoverse V3, the
second form (with INCB) can have significantly better latency and
throughput. On the Neoverse V2, for example, ADDVL has a latency and
throughput of 2, whereas INCB has a latency of 1 and a throughput of 4
(and similarly for the Neoverse V3, though in this case the throughput
is further increased to 8). The same applies to DECB. This patch adds
patterns to prefer the INCB/DECB forms over ADDVL where applicable.
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 30 ++++++---
.../CodeGen/AArch64/sme-framelower-use-bp.ll | 6 +-
.../CodeGen/AArch64/sme-intrinsics-loads.ll | 8 +--
.../CodeGen/AArch64/sme-intrinsics-stores.ll | 8 +--
llvm/test/CodeGen/AArch64/sve-lsrchain.ll | 2 +-
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 8 +--
.../AArch64/sve2p1-intrinsics-ld1-single.ll | 8 +--
.../AArch64/sve2p1-intrinsics-st1-single.ll | 8 +--
.../AArch64/vscale-fixups.ll | 67 ++++++++++---------
9 files changed, 81 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fb0eb7a80c6d72..fd07a1d1f356cd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -142,11 +142,13 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
// SVE CNT/INC/RDVL
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
+def sve_cntb_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 16>">;
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">;
def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">;
// SVE DEC
+def sve_cntb_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -16>">;
def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">;
@@ -2680,14 +2682,8 @@ let Predicates = [HasSVEorSME] in {
}
let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in {
- def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
- (ADDVL_XXI GPR64:$op, $imm)>;
-
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
- (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
- GPR32:$op, sub_32), $imm),
- sub_32)>;
-
+ def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm i32:$imm))),
+ (INCB_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
(INCH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
@@ -2695,6 +2691,8 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
(INCD_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm_neg i32:$imm))),
+ (DECB_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
(DECH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
@@ -2702,6 +2700,13 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
(DECD_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDVL_XXI GPR64:$op, $imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm i32:$imm))))),
+ (EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
(EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2715,6 +2720,10 @@ let Predicates = [HasSVEorSME] in {
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm_neg i32:$imm))))),
+ (EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
(EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2727,6 +2736,11 @@ let Predicates = [HasSVEorSME] in {
(EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), $imm),
+ sub_32)>;
}
// For big endian, only BITCASTs involving same sized vector types with same
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 8d028c11b4a6b0..8d81c93316a6b1 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -64,7 +64,8 @@ define void @quux() #1 {
; CHECK-NEXT: subs x9, x9, x14
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: str x9, [x19, #32] // 8-byte Folded Spill
-; CHECK-NEXT: addvl x9, x8, #1
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: incb x9
; CHECK-NEXT: mov w0, w9
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: mov w9, w0
@@ -147,7 +148,8 @@ define void @quux() #1 {
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: subs x9, x9, #16
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: addvl x9, x8, #2
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: incb x9, all, mul #2
; CHECK-NEXT: mov w0, w9
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: mov w9, w0
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index 57f8e5438eaf2b..09a8ae4c595f7d 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -271,9 +271,9 @@ define void @ldr_with_off_15(ptr %ptr) {
define void @ldr_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: addvl x8, x0, #15
-; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
define void @ldr_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_16mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: addvl x8, x0, #16
-; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
index 1ff32aade4a1f9..a0a40e6a5eff54 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -271,9 +271,9 @@ define void @str_with_off_15(ptr %ptr) {
define void @str_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_15mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: addvl x8, x0, #15
-; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: str za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @str_with_off_15mulvl(ptr %ptr) {
define void @str_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_16mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: addvl x8, x0, #16
-; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: str za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
index 1931cfc2ef51de..5efae5da73b8d8 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
@@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
; CHECK-NEXT: ld1h { z5.h }, p0/z, [x4, #3, mul vl]
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT: st1h { z4.h }, p0, [x16, #3, mul vl]
-; CHECK-NEXT: addvl x16, x16, #4
+; CHECK-NEXT: incb x16, all, mul #4
; CHECK-NEXT: cmp x16, x11
; CHECK-NEXT: b.lo .LBB0_4
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index dad357c8a0c132..35f4e19da8423b 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -123,7 +123,7 @@ define i64 @incb_scalar_i64(i64 %a) {
;
; CHECK-LABEL: incb_scalar_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x0, x0, #1
+; CHECK-NEXT: incb x0
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mul = mul i64 %vscale, 16
@@ -193,7 +193,7 @@ define i64 @decb_scalar_i64(i64 %a) {
;
; CHECK-LABEL: decb_scalar_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x0, x0, #-2
+; CHECK-NEXT: decb x0, all, mul #2
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mul = mul i64 %vscale, 32
@@ -264,7 +264,7 @@ define i32 @incb_scalar_i32(i32 %a) {
; CHECK-LABEL: incb_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: addvl x0, x0, #3
+; CHECK-NEXT: incb x0, all, mul #3
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -350,7 +350,7 @@ define i32 @decb_scalar_i32(i32 %a) {
; CHECK-LABEL: decb_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: addvl x0, x0, #-4
+; CHECK-NEXT: decb x0, all, mul #4
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
index 39ee4510d51b48..d504e39a01dd0d 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
@@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba
define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #2
-; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
+; CHECK-NEXT: incb x0, all, mul #2
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
@@ -101,8 +101,8 @@ define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, ptr %ba
define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-5
-; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
+; CHECK-NEXT: decb x0, all, mul #5
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
index 4ffc0b42d07118..538c585d864ab2 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
@@ -30,8 +30,8 @@ define void @test_svst1wq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred
define void @test_svst1wq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svst1wq_i32_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #2
-; CHECK-NEXT: st1w { z0.q }, p0, [x8]
+; CHECK-NEXT: incb x0, all, mul #2
+; CHECK-NEXT: st1w { z0.q }, p0, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
@@ -91,8 +91,8 @@ define void @test_svst1dq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred
define void @test_svst1dq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svst1dq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-5
-; CHECK-NEXT: st1d { z0.q }, p0, [x8]
+; CHECK-NEXT: decb x0, all, mul #5
+; CHECK-NEXT: st1d { z0.q }, p0, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 588696d20227fd..6707939abe4eb8 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -17,13 +17,13 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #2, mul vl]
; COMMON-NEXT: ld1b { z3.b }, p0/z, [x0, #3, mul vl]
-; COMMON-NEXT: addvl x0, x0, #5
+; COMMON-NEXT: incb x0, all, mul #5
; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b
; COMMON-NEXT: movprfx z1, z2
; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b
; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b
; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8]
-; COMMON-NEXT: addvl x8, x8, #1
+; COMMON-NEXT: incb x8
; COMMON-NEXT: cmp x8, x2
; COMMON-NEXT: b.lo .LBB0_1
; COMMON-NEXT: // %bb.2: // %for.exit
@@ -71,13 +71,13 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i
; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, x2]
; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl]
; COMMON-NEXT: ld1b { z3.b }, p0/z, [x8, #1, mul vl]
+; COMMON-NEXT: incb x0, all, mul #2
; COMMON-NEXT: subs x3, x3, #1
-; COMMON-NEXT: addvl x0, x0, #2
; COMMON-NEXT: add z0.b, z0.b, z1.b
; COMMON-NEXT: add z1.b, z2.b, z3.b
; COMMON-NEXT: st1b { z0.h }, p1, [x1]
; COMMON-NEXT: st1b { z1.h }, p1, [x1, #1, mul vl]
-; COMMON-NEXT: addvl x1, x1, #2
+; COMMON-NEXT: incb x1, all, mul #2
; COMMON-NEXT: b.ne .LBB1_1
; COMMON-NEXT: // %bb.2: // %for.exit
; COMMON-NEXT: ret
@@ -157,56 +157,56 @@ for.exit:
define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 {
; BASE-LABEL: mixed_offsets_scalable_then_fixed:
; BASE: // %bb.0: // %entry
+; BASE-NEXT: incb x0, all, mul #4
; BASE-NEXT: ptrue p0.s
-; BASE-NEXT: addvl x8, x0, #4
-; BASE-NEXT: mov x9, #8 // =0x8
+; BASE-NEXT: mov x8, #8 // =0x8
; BASE-NEXT: .LBB3_1: // %for.body
; BASE-NEXT: // =>This Inner Loop Header: Depth=1
-; BASE-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl]
-; BASE-NEXT: ld1w { z1.s }, p0/z, [x8]
+; BASE-NEXT: ld1w { z0.s }, p0/z, [x0, #-4, mul vl]
+; BASE-NEXT: ld1w { z1.s }, p0/z, [x0]
; BASE-NEXT: decw x2
-; BASE-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
-; BASE-NEXT: addvl x8, x8, #1
+; BASE-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
+; BASE-NEXT: incb x0
; BASE-NEXT: add z0.s, z0.s, z1.s
; BASE-NEXT: add z0.s, z0.s, z2.s
; BASE-NEXT: st1w { z0.s }, p0, [x1]
-; BASE-NEXT: addvl x1, x1, #1
+; BASE-NEXT: incb x1
; BASE-NEXT: cbnz x2, .LBB3_1
; BASE-NEXT: // %bb.2: // %for.exit
; BASE-NEXT: ret
;
; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed:
; PREINDEX: // %bb.0: // %entry
+; PREINDEX-NEXT: incb x0, all, mul #4
; PREINDEX-NEXT: ptrue p0.s
-; PREINDEX-NEXT: addvl x8, x0, #4
-; PREINDEX-NEXT: mov x9, #8 // =0x8
+; PREINDEX-NEXT: mov x8, #8 // =0x8
; PREINDEX-NEXT: .LBB3_1: // %for.body
; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl]
-; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x8]
+; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x0, #-4, mul vl]
+; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0]
; PREINDEX-NEXT: decw x2
-; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
-; PREINDEX-NEXT: addvl x8, x8, #1
+; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
+; PREINDEX-NEXT: incb x0
; PREINDEX-NEXT: add z0.s, z0.s, z1.s
; PREINDEX-NEXT: add z0.s, z0.s, z2.s
; PREINDEX-NEXT: st1w { z0.s }, p0, [x1]
-; PREINDEX-NEXT: addvl x1, x1, #1
+; PREINDEX-NEXT: incb x1
; PREINDEX-NEXT: cbnz x2, .LBB3_1
; PREINDEX-NEXT: // %bb.2: // %for.exit
; PREINDEX-NEXT: ret
;
; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed:
; POSTINDEX: // %bb.0: // %entry
+; POSTINDEX-NEXT: incb x0, all, mul #4
; POSTINDEX-NEXT: ptrue p0.s
; POSTINDEX-NEXT: mov x8, xzr
-; POSTINDEX-NEXT: addvl x9, x0, #4
-; POSTINDEX-NEXT: mov x10, #8 // =0x8
+; POSTINDEX-NEXT: mov x9, #8 // =0x8
; POSTINDEX-NEXT: .LBB3_1: // %for.body
; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x9, #-4, mul vl]
-; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x9]
-; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x9, x10, lsl #2]
-; POSTINDEX-NEXT: addvl x9, x9, #1
+; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0, #-4, mul vl]
+; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x0]
+; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
+; POSTINDEX-NEXT: incb x0
; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -244,11 +244,12 @@ for.exit:
define void @mixed_offsets_fixed_then_scalable(ptr %src, ptr %dst, i64 %count) #0 {
; COMMON-LABEL: mixed_offsets_fixed_then_scalable:
; COMMON: // %bb.0: // %entry
-; COMMON-NEXT: addvl x9, x0, #4
+; COMMON-NEXT: mov x9, x0
; COMMON-NEXT: ptrue p0.s
; COMMON-NEXT: mov x8, xzr
-; COMMON-NEXT: add x9, x9, #32
+; COMMON-NEXT: incb x9, all, mul #4
; COMMON-NEXT: mov x10, #8 // =0x8
+; COMMON-NEXT: add x9, x9, #32
; COMMON-NEXT: .LBB4_1: // %for.body
; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
; COMMON-NEXT: add x11, x0, x8, lsl #2
@@ -308,11 +309,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; BASE-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl]
; BASE-NEXT: decw x2
; BASE-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
-; BASE-NEXT: addvl x0, x0, #1
+; BASE-NEXT: incb x0
; BASE-NEXT: add z0.s, z0.s, z1.s
; BASE-NEXT: add z0.s, z0.s, z2.s
; BASE-NEXT: st1w { z0.s }, p0, [x1]
-; BASE-NEXT: addvl x1, x1, #1
+; BASE-NEXT: incb x1
; BASE-NEXT: cbnz x2, .LBB5_1
; BASE-NEXT: // %bb.2: // %for.exit
; BASE-NEXT: ret
@@ -328,11 +329,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl]
; PREINDEX-NEXT: decw x2
; PREINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
-; PREINDEX-NEXT: addvl x0, x0, #1
+; PREINDEX-NEXT: incb x0
; PREINDEX-NEXT: add z0.s, z0.s, z1.s
; PREINDEX-NEXT: add z0.s, z0.s, z2.s
; PREINDEX-NEXT: st1w { z0.s }, p0, [x1]
-; PREINDEX-NEXT: addvl x1, x1, #1
+; PREINDEX-NEXT: incb x1
; PREINDEX-NEXT: cbnz x2, .LBB5_1
; PREINDEX-NEXT: // %bb.2: // %for.exit
; PREINDEX-NEXT: ret
@@ -348,7 +349,7 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0]
; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl]
; POSTINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x9]
-; POSTINDEX-NEXT: addvl x0, x0, #1
+; POSTINDEX-NEXT: incb x0
; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -405,9 +406,9 @@ define void @vscale_squared_offset(ptr %alloc) #0 {
; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
; COMMON-NEXT: add x11, x0, x9
; COMMON-NEXT: st1w { z0.s }, p0, [x0]
-; COMMON-NEXT: add x8, x8, #1
+; COMMON-NEXT: incb x0
; COMMON-NEXT: st1w { z1.s }, p0, [x11]
-; COMMON-NEXT: addvl x0, x0, #1
+; COMMON-NEXT: add x8, x8, #1
; COMMON-NEXT: cmp x8, x10
; COMMON-NEXT: b.lt .LBB6_1
; COMMON-NEXT: .LBB6_2: // %for.exit