[llvm] [AArch64][SVE] Fold ADD+CNTB to INCB and DECB (PR #118280)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 10:07:34 PDT 2025
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/118280
From 8a67870719d8cb1d8b66e5b7053d74e70ac96574 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 29 Nov 2024 10:58:31 -0800
Subject: [PATCH 1/2] [AArch64][SVE] Fold ADD+CNTB to INCB/DECB
Currently, given:
```cpp
#include <arm_sve.h>

uint64_t incb(uint64_t x) {
  return x + svcntb();
}
```
LLVM generates:
```gas
incb:
addvl x0, x0, #1
ret
```
which is functionally equivalent to:
```gas
incb:
incb x0
ret
```
However, on microarchitectures like the Neoverse V2 and Neoverse V3, the
second form (with INCB) can have significantly better latency and
throughput. On the Neoverse V2, for example, ADDVL has a latency and
throughput of 2, whereas INCB has a latency of 1 and a throughput of 4
(and similarly for the Neoverse V3, though in this case the throughput
is further increased to 8). The same applies to DECB. This patch adds
patterns to prefer the INCB/DECB forms over ADDVL where applicable.
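As a rough sketch of the DECB side (the function below is illustrative and not part of the patch; it assumes the same arm_sve.h intrinsic as the example above), subtracting svcntb() should now lower to DECB rather than ADDVL with a negative immediate:
```cpp
#include <arm_sve.h>

// Illustrative only: the decrement analogue of the incb example above.
// Expected lowering with this patch: "decb x0" instead of "addvl x0, x0, #-1".
uint64_t decb(uint64_t x) {
  return x - svcntb();
}
```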
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 30 ++++++---
.../CodeGen/AArch64/sme-framelower-use-bp.ll | 6 +-
.../CodeGen/AArch64/sme-intrinsics-loads.ll | 8 +--
.../CodeGen/AArch64/sme-intrinsics-stores.ll | 8 +--
llvm/test/CodeGen/AArch64/sve-lsrchain.ll | 2 +-
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 8 +--
.../AArch64/sve2p1-intrinsics-ld1-single.ll | 8 +--
.../AArch64/sve2p1-intrinsics-st1-single.ll | 8 +--
.../AArch64/vscale-fixups.ll | 67 ++++++++++---------
9 files changed, 81 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index a2f326c994c2f..813907a6e8e15 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -142,11 +142,13 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
// SVE CNT/INC/RDVL
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
+def sve_cntb_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 16>">;
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">;
def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">;
// SVE DEC
+def sve_cntb_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -16>">;
def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">;
@@ -2678,14 +2680,8 @@ let Predicates = [HasSVE_or_SME] in {
}
let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in {
- def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
- (ADDVL_XXI GPR64:$op, $imm)>;
-
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
- (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
- GPR32:$op, sub_32), $imm),
- sub_32)>;
-
+ def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm i32:$imm))),
+ (INCB_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
(INCH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
@@ -2693,6 +2689,8 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
(INCD_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm_neg i32:$imm))),
+ (DECB_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
(DECH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
@@ -2700,6 +2698,13 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
(DECD_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDVL_XXI GPR64:$op, $imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm i32:$imm))))),
+ (EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
(EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2713,6 +2718,10 @@ let Predicates = [HasSVE_or_SME] in {
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm_neg i32:$imm))))),
+ (EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
(EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2725,6 +2734,11 @@ let Predicates = [HasSVE_or_SME] in {
(EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), $imm),
+ sub_32)>;
}
// For big endian, only BITCASTs involving same sized vector types with same
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index f49bb910b5bd1..99c65b090adb0 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -65,7 +65,8 @@ define void @quux() #1 {
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: sub x10, x29, #104
; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: addvl x9, x8, #1
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: incb x9
; CHECK-NEXT: mov w0, w9
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: mov w9, w0
@@ -160,7 +161,8 @@ define void @quux() #1 {
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: subs x9, x9, #16
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: addvl x9, x8, #2
+; CHECK-NEXT: mov x9, x8
+; CHECK-NEXT: incb x9, all, mul #2
; CHECK-NEXT: mov w0, w9
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: mov w9, w0
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index 57f8e5438eaf2..09a8ae4c595f7 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -271,9 +271,9 @@ define void @ldr_with_off_15(ptr %ptr) {
define void @ldr_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: addvl x8, x0, #15
-; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
define void @ldr_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_16mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: addvl x8, x0, #16
-; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
index 1ff32aade4a1f..a0a40e6a5eff5 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -271,9 +271,9 @@ define void @str_with_off_15(ptr %ptr) {
define void @str_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_15mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: addvl x8, x0, #15
-; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: str za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @str_with_off_15mulvl(ptr %ptr) {
define void @str_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_16mulvl:
; CHECK: // %bb.0:
+; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: addvl x8, x0, #16
-; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: str za[w12, 0], [x0]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
index 78f93f1ecbb26..d94fa6433bb7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
@@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
; CHECK-NEXT: ldr z5, [x4, #3, mul vl]
; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h
; CHECK-NEXT: str z4, [x16, #3, mul vl]
-; CHECK-NEXT: addvl x16, x16, #4
+; CHECK-NEXT: incb x16, all, mul #4
; CHECK-NEXT: cmp x16, x11
; CHECK-NEXT: b.lo .LBB0_4
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index a6c0e5aa70583..8cdbdb0a751c8 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -123,7 +123,7 @@ define i64 @incb_scalar_i64(i64 %a) {
;
; CHECK-LABEL: incb_scalar_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x0, x0, #1
+; CHECK-NEXT: incb x0
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mul = mul i64 %vscale, 16
@@ -193,7 +193,7 @@ define i64 @decb_scalar_i64(i64 %a) {
;
; CHECK-LABEL: decb_scalar_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x0, x0, #-2
+; CHECK-NEXT: decb x0, all, mul #2
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mul = mul i64 %vscale, 32
@@ -264,7 +264,7 @@ define i32 @incb_scalar_i32(i32 %a) {
; CHECK-LABEL: incb_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: addvl x0, x0, #3
+; CHECK-NEXT: incb x0, all, mul #3
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
@@ -350,7 +350,7 @@ define i32 @decb_scalar_i32(i32 %a) {
; CHECK-LABEL: decb_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: addvl x0, x0, #-4
+; CHECK-NEXT: decb x0, all, mul #4
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
index 39ee4510d51b4..d504e39a01dd0 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
@@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba
define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svld1uwq_i32_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #2
-; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8]
+; CHECK-NEXT: incb x0, all, mul #2
+; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep)
@@ -101,8 +101,8 @@ define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, ptr %ba
define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-5
-; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
+; CHECK-NEXT: decb x0, all, mul #5
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
index 4ffc0b42d0711..538c585d864ab 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
@@ -30,8 +30,8 @@ define void @test_svst1wq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred
define void @test_svst1wq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svst1wq_i32_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #2
-; CHECK-NEXT: st1w { z0.q }, p0, [x8]
+; CHECK-NEXT: incb x0, all, mul #2
+; CHECK-NEXT: st1w { z0.q }, p0, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8
call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep)
@@ -91,8 +91,8 @@ define void @test_svst1dq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred
define void @test_svst1dq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svst1dq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: addvl x8, x0, #-5
-; CHECK-NEXT: st1d { z0.q }, p0, [x8]
+; CHECK-NEXT: decb x0, all, mul #5
+; CHECK-NEXT: st1d { z0.q }, p0, [x0]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index bae69ef590f52..d655206b39f1d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -17,13 +17,13 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
; COMMON-NEXT: ldr z1, [x0, #1, mul vl]
; COMMON-NEXT: ldr z2, [x0, #2, mul vl]
; COMMON-NEXT: ldr z3, [x0, #3, mul vl]
-; COMMON-NEXT: addvl x0, x0, #5
+; COMMON-NEXT: incb x0, all, mul #5
; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b
; COMMON-NEXT: movprfx z1, z2
; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b
; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b
; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8]
-; COMMON-NEXT: addvl x8, x8, #1
+; COMMON-NEXT: incb x8
; COMMON-NEXT: cmp x8, x2
; COMMON-NEXT: b.lo .LBB0_1
; COMMON-NEXT: // %bb.2: // %for.exit
@@ -71,13 +71,13 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i
; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, x2]
; COMMON-NEXT: ldr z2, [x0, #1, mul vl]
; COMMON-NEXT: ldr z3, [x8, #1, mul vl]
+; COMMON-NEXT: incb x0, all, mul #2
; COMMON-NEXT: subs x3, x3, #1
-; COMMON-NEXT: addvl x0, x0, #2
; COMMON-NEXT: add z0.b, z0.b, z1.b
; COMMON-NEXT: add z1.b, z2.b, z3.b
; COMMON-NEXT: st1b { z0.h }, p1, [x1]
; COMMON-NEXT: st1b { z1.h }, p1, [x1, #1, mul vl]
-; COMMON-NEXT: addvl x1, x1, #2
+; COMMON-NEXT: incb x1, all, mul #2
; COMMON-NEXT: b.ne .LBB1_1
; COMMON-NEXT: // %bb.2: // %for.exit
; COMMON-NEXT: ret
@@ -156,56 +156,56 @@ for.exit:
define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 {
; BASE-LABEL: mixed_offsets_scalable_then_fixed:
; BASE: // %bb.0: // %entry
+; BASE-NEXT: incb x0, all, mul #4
; BASE-NEXT: ptrue p0.s
-; BASE-NEXT: addvl x8, x0, #4
-; BASE-NEXT: mov x9, #8 // =0x8
+; BASE-NEXT: mov x8, #8 // =0x8
; BASE-NEXT: .LBB3_1: // %for.body
; BASE-NEXT: // =>This Inner Loop Header: Depth=1
-; BASE-NEXT: ldr z0, [x8, #-4, mul vl]
-; BASE-NEXT: ldr z1, [x8]
+; BASE-NEXT: ldr z0, [x0, #-4, mul vl]
+; BASE-NEXT: ldr z1, [x0]
; BASE-NEXT: decw x2
-; BASE-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
-; BASE-NEXT: addvl x8, x8, #1
+; BASE-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
+; BASE-NEXT: incb x0
; BASE-NEXT: add z0.s, z0.s, z1.s
; BASE-NEXT: add z0.s, z0.s, z2.s
; BASE-NEXT: str z0, [x1]
-; BASE-NEXT: addvl x1, x1, #1
+; BASE-NEXT: incb x1
; BASE-NEXT: cbnz x2, .LBB3_1
; BASE-NEXT: // %bb.2: // %for.exit
; BASE-NEXT: ret
;
; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed:
; PREINDEX: // %bb.0: // %entry
+; PREINDEX-NEXT: incb x0, all, mul #4
; PREINDEX-NEXT: ptrue p0.s
-; PREINDEX-NEXT: addvl x8, x0, #4
-; PREINDEX-NEXT: mov x9, #8 // =0x8
+; PREINDEX-NEXT: mov x8, #8 // =0x8
; PREINDEX-NEXT: .LBB3_1: // %for.body
; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; PREINDEX-NEXT: ldr z0, [x8, #-4, mul vl]
-; PREINDEX-NEXT: ldr z1, [x8]
+; PREINDEX-NEXT: ldr z0, [x0, #-4, mul vl]
+; PREINDEX-NEXT: ldr z1, [x0]
; PREINDEX-NEXT: decw x2
-; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2]
-; PREINDEX-NEXT: addvl x8, x8, #1
+; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
+; PREINDEX-NEXT: incb x0
; PREINDEX-NEXT: add z0.s, z0.s, z1.s
; PREINDEX-NEXT: add z0.s, z0.s, z2.s
; PREINDEX-NEXT: str z0, [x1]
-; PREINDEX-NEXT: addvl x1, x1, #1
+; PREINDEX-NEXT: incb x1
; PREINDEX-NEXT: cbnz x2, .LBB3_1
; PREINDEX-NEXT: // %bb.2: // %for.exit
; PREINDEX-NEXT: ret
;
; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed:
; POSTINDEX: // %bb.0: // %entry
+; POSTINDEX-NEXT: incb x0, all, mul #4
; POSTINDEX-NEXT: ptrue p0.s
; POSTINDEX-NEXT: mov x8, xzr
-; POSTINDEX-NEXT: addvl x9, x0, #4
-; POSTINDEX-NEXT: mov x10, #8 // =0x8
+; POSTINDEX-NEXT: mov x9, #8 // =0x8
; POSTINDEX-NEXT: .LBB3_1: // %for.body
; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; POSTINDEX-NEXT: ldr z0, [x9, #-4, mul vl]
-; POSTINDEX-NEXT: ldr z1, [x9]
-; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x9, x10, lsl #2]
-; POSTINDEX-NEXT: addvl x9, x9, #1
+; POSTINDEX-NEXT: ldr z0, [x0, #-4, mul vl]
+; POSTINDEX-NEXT: ldr z1, [x0]
+; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
+; POSTINDEX-NEXT: incb x0
; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -243,11 +243,12 @@ for.exit:
define void @mixed_offsets_fixed_then_scalable(ptr %src, ptr %dst, i64 %count) #0 {
; COMMON-LABEL: mixed_offsets_fixed_then_scalable:
; COMMON: // %bb.0: // %entry
-; COMMON-NEXT: addvl x9, x0, #4
+; COMMON-NEXT: mov x9, x0
; COMMON-NEXT: ptrue p0.s
; COMMON-NEXT: mov x8, xzr
-; COMMON-NEXT: add x9, x9, #32
+; COMMON-NEXT: incb x9, all, mul #4
; COMMON-NEXT: mov x10, #8 // =0x8
+; COMMON-NEXT: add x9, x9, #32
; COMMON-NEXT: .LBB4_1: // %for.body
; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
; COMMON-NEXT: add x11, x0, x8, lsl #2
@@ -304,11 +305,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; BASE-NEXT: ldr z1, [x0, #4, mul vl]
; BASE-NEXT: decw x2
; BASE-NEXT: ldr z2, [x0, #8, mul vl]
-; BASE-NEXT: addvl x0, x0, #1
+; BASE-NEXT: incb x0
; BASE-NEXT: add z0.s, z0.s, z1.s
; BASE-NEXT: add z0.s, z0.s, z2.s
; BASE-NEXT: str z0, [x1]
-; BASE-NEXT: addvl x1, x1, #1
+; BASE-NEXT: incb x1
; BASE-NEXT: cbnz x2, .LBB5_1
; BASE-NEXT: // %bb.2: // %for.exit
; BASE-NEXT: ret
@@ -321,11 +322,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; PREINDEX-NEXT: ldr z1, [x0, #4, mul vl]
; PREINDEX-NEXT: decw x2
; PREINDEX-NEXT: ldr z2, [x0, #8, mul vl]
-; PREINDEX-NEXT: addvl x0, x0, #1
+; PREINDEX-NEXT: incb x0
; PREINDEX-NEXT: add z0.s, z0.s, z1.s
; PREINDEX-NEXT: add z0.s, z0.s, z2.s
; PREINDEX-NEXT: str z0, [x1]
-; PREINDEX-NEXT: addvl x1, x1, #1
+; PREINDEX-NEXT: incb x1
; PREINDEX-NEXT: cbnz x2, .LBB5_1
; PREINDEX-NEXT: // %bb.2: // %for.exit
; PREINDEX-NEXT: ret
@@ -339,7 +340,7 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
; POSTINDEX-NEXT: ldr z0, [x0]
; POSTINDEX-NEXT: ldr z1, [x0, #4, mul vl]
; POSTINDEX-NEXT: ldr z2, [x0, #8, mul vl]
-; POSTINDEX-NEXT: addvl x0, x0, #1
+; POSTINDEX-NEXT: incb x0
; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -396,9 +397,9 @@ define void @vscale_squared_offset(ptr %alloc) #0 {
; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
; COMMON-NEXT: add x11, x0, x9
; COMMON-NEXT: st1w { z0.s }, p0, [x0]
-; COMMON-NEXT: add x8, x8, #1
+; COMMON-NEXT: incb x0
; COMMON-NEXT: st1w { z1.s }, p0, [x11]
-; COMMON-NEXT: addvl x0, x0, #1
+; COMMON-NEXT: add x8, x8, #1
; COMMON-NEXT: cmp x8, x10
; COMMON-NEXT: b.lt .LBB6_1
; COMMON-NEXT: .LBB6_2: // %for.exit
From 68a93cb283a7e6f535c9f7d5329adc89e8c1b958 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 8 Apr 2025 11:52:26 -0700
Subject: [PATCH 2/2] Restrict patterns to INCB/DECB, ALL, {1, 2, 4}
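To make the effect of this restriction concrete, here is a small illustrative sketch (these functions are hypothetical and not taken from the patch; the lowerings in the comments follow the updated tests below): only vector-length multiples of 1, 2 and 4 take the INCB/DECB form, while other multiples keep ADDVL.
```cpp
#include <arm_sve.h>

// Illustrative only: a multiplier outside {1, 2, 4} is no longer folded to INCB
// and keeps the ADDVL form.
uint64_t add_three_vl(uint64_t x) {
  return x + 3 * svcntb();   // expected lowering: addvl x0, x0, #3
}

// Multipliers 1, 2 and 4 still use INCB, e.g.:
uint64_t add_two_vl(uint64_t x) {
  return x + 2 * svcntb();   // expected lowering: incb x0, all, mul #2
}
```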
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 52 +++++++++++--------
.../CodeGen/AArch64/sme-intrinsics-loads.ll | 8 +--
.../CodeGen/AArch64/sme-intrinsics-stores.ll | 8 +--
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 2 +-
.../AArch64/sve2p1-intrinsics-ld1-single.ll | 4 +-
.../AArch64/sve2p1-intrinsics-st1-single.ll | 4 +-
.../AArch64/vscale-fixups.ll | 2 +-
7 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 813907a6e8e15..3e9c749989069 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -142,13 +142,11 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
// SVE CNT/INC/RDVL
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
-def sve_cntb_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 16>">;
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">;
def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">;
// SVE DEC
-def sve_cntb_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -16>">;
def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">;
@@ -2680,8 +2678,36 @@ let Predicates = [HasSVE_or_SME] in {
}
let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in {
- def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm i32:$imm))),
- (INCB_XPiI GPR64:$op, 31, $imm)>;
+ // Some INCB/DECB forms have better latency and throughput than ADDVL on
+ // microarchitectures such as the Neoverse V2 and Neoverse V3, so we prefer
+ // using them here.
+ foreach imm = [ 1, 2, 4 ] in
+ let AddedComplexity = 6 in {
+ def : Pat<(add GPR64:$op, (vscale !mul(imm, 16))),
+ (INCB_XPiI GPR64:$op, 31, imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, 16))))),
+ (EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, imm),
+ sub_32)>;
+
+ def : Pat<(add GPR64:$op, (vscale !mul(imm, -16))),
+ (DECB_XPiI GPR64:$op, 31, imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, -16))))),
+ (EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), 31, imm),
+ sub_32)>;
+ }
+
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDVL_XXI GPR64:$op, $imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
+ GPR32:$op, sub_32), $imm),
+ sub_32)>;
+
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
(INCH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
@@ -2689,8 +2715,6 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
(INCD_XPiI GPR64:$op, 31, $imm)>;
- def : Pat<(add GPR64:$op, (vscale (sve_cntb_imm_neg i32:$imm))),
- (DECB_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
(DECH_XPiI GPR64:$op, 31, $imm)>;
def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
@@ -2698,13 +2722,6 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
(DECD_XPiI GPR64:$op, 31, $imm)>;
- def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
- (ADDVL_XXI GPR64:$op, $imm)>;
-
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm i32:$imm))))),
- (EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
- GPR32:$op, sub_32), 31, $imm),
- sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
(EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2718,10 +2735,6 @@ let Predicates = [HasSVE_or_SME] in {
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntb_imm_neg i32:$imm))))),
- (EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
- GPR32:$op, sub_32), 31, $imm),
- sub_32)>;
def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
(EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
@@ -2734,11 +2747,6 @@ let Predicates = [HasSVE_or_SME] in {
(EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
GPR32:$op, sub_32), 31, $imm),
sub_32)>;
-
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
- (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
- GPR32:$op, sub_32), $imm),
- sub_32)>;
}
// For big endian, only BITCASTs involving same sized vector types with same
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index 09a8ae4c595f7..57f8e5438eaf2 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -271,9 +271,9 @@ define void @ldr_with_off_15(ptr %ptr) {
define void @ldr_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15mulvl:
; CHECK: // %bb.0:
-; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: ldr za[w12, 0], [x0]
+; CHECK-NEXT: addvl x8, x0, #15
+; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
define void @ldr_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_16mulvl:
; CHECK: // %bb.0:
-; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: ldr za[w12, 0], [x0]
+; CHECK-NEXT: addvl x8, x0, #16
+; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
index a0a40e6a5eff5..1ff32aade4a1f 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -271,9 +271,9 @@ define void @str_with_off_15(ptr %ptr) {
define void @str_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_15mulvl:
; CHECK: // %bb.0:
-; CHECK-NEXT: incb x0, all, mul #15
; CHECK-NEXT: mov w12, #15 // =0xf
-; CHECK-NEXT: str za[w12, 0], [x0]
+; CHECK-NEXT: addvl x8, x0, #15
+; CHECK-NEXT: str za[w12, 0], [x8]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
@@ -285,9 +285,9 @@ define void @str_with_off_15mulvl(ptr %ptr) {
define void @str_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: str_with_off_16mulvl:
; CHECK: // %bb.0:
-; CHECK-NEXT: incb x0, all, mul #16
; CHECK-NEXT: mov w12, #16 // =0x10
-; CHECK-NEXT: str za[w12, 0], [x0]
+; CHECK-NEXT: addvl x8, x0, #16
+; CHECK-NEXT: str za[w12, 0], [x8]
; CHECK-NEXT: ret
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index 8cdbdb0a751c8..12ae063c63a16 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -264,7 +264,7 @@ define i32 @incb_scalar_i32(i32 %a) {
; CHECK-LABEL: incb_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: incb x0, all, mul #3
+; CHECK-NEXT: addvl x0, x0, #3
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
index d504e39a01dd0..abb5f2aa07fc8 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll
@@ -101,8 +101,8 @@ define <vscale x 2 x i64> @test_svld1udq_i64_si(<vscale x 1 x i1> %pred, ptr %ba
define <vscale x 2 x i64> @test_svld1udq_i64_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svld1udq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: decb x0, all, mul #5
-; CHECK-NEXT: ld1d { z0.q }, p0/z, [x0]
+; CHECK-NEXT: addvl x8, x0, #-5
+; CHECK-NEXT: ld1d { z0.q }, p0/z, [x8]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
index 538c585d864ab..d4c77328be478 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll
@@ -91,8 +91,8 @@ define void @test_svst1dq_i64_si(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred
define void @test_svst1dq_i64_out_of_bound(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %base) {
; CHECK-LABEL: test_svst1dq_i64_out_of_bound:
; CHECK: // %bb.0:
-; CHECK-NEXT: decb x0, all, mul #5
-; CHECK-NEXT: st1d { z0.q }, p0, [x0]
+; CHECK-NEXT: addvl x8, x0, #-5
+; CHECK-NEXT: st1d { z0.q }, p0, [x8]
; CHECK-NEXT: ret
%gep = getelementptr inbounds <vscale x 1 x i64>, ptr %base, i64 -10
call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> %zt, <vscale x 1 x i1> %pred, ptr %gep)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index d655206b39f1d..aa954aeb0ad07 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -17,7 +17,7 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
; COMMON-NEXT: ldr z1, [x0, #1, mul vl]
; COMMON-NEXT: ldr z2, [x0, #2, mul vl]
; COMMON-NEXT: ldr z3, [x0, #3, mul vl]
-; COMMON-NEXT: incb x0, all, mul #5
+; COMMON-NEXT: addvl x0, x0, #5
; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b
; COMMON-NEXT: movprfx z1, z2
; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b