[llvm] [AArch64][CodeGen] Add patterns for small negative VScale const (PR #89607)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 22 07:09:29 PDT 2024
vfdff (https://github.com/vfdff) created https://github.com/llvm/llvm-project/pull/89607
On AArch64, rdvl can accept a negative value, while cntd/cntw/cnth can't.
Since we support VScale with a negative multiply value, we do not limit
the negative value, and instead take the hit of having the extra patterns.
Also add NoUseScalarIncVL to avoid affecting patterns that work for -mattr=+use-scalar-inc-vl.
Fixes https://github.com/llvm/llvm-project/issues/84620
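As background for why rdvl is special here, a minimal sketch of the immediate
ranges involved (ranges per the SVE instruction descriptions; the helper names
are just for the sketch, not from this patch):

  #include <cstdint>

  // rdvl Xd, #imm computes imm * 16 * vscale bytes with a *signed*
  // imm in [-32, 31], so it can materialize negative multiples of 16.
  bool isRdvlImm(int64_t C) { return C % 16 == 0 && C / 16 >= -32 && C / 16 <= 31; }

  // cnth Xd, all, mul #m computes m * 8 * vscale with an *unsigned*
  // m in [1, 16], so a negative multiple needs a cnth followed by a sub.
  bool isCnthImm(int64_t C) { return C % 8 == 0 && C / 8 >= 1 && C / 8 <= 16; }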
From 1b75ddfbcd4bce9bb3e5e05ff37a1d1ea1bb666b Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Tue, 9 Apr 2024 05:25:16 -0400
Subject: [PATCH 1/2] [DAG] Prevent combination on small negative value for
vscale
Add a TargetLowering hook that targets can use to indicate when an
immediate constant for VScale is preferred.
On AArch64, rdvl can accept a negative value, while cntd/cntw/cnth can't.
The hook defaults to returning true to avoid affecting the other backends.
Fixes https://github.com/llvm/llvm-project/issues/84620
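To see how the hook behaves for concrete constants, a standalone sketch of the
AArch64 override added later in this patch (the wrapper taking UseScalarIncVL
as a parameter and the driver are illustrative, not part of the change):

  #include "llvm/ADT/APInt.h"
  #include <iostream>
  using llvm::APInt;

  static bool isPreferVScaleConst(const APInt &Imm, bool UseScalarIncVL) {
    if (UseScalarIncVL)
      return true;
    // Keep the sub form for multiples of 8 that are not negated powers
    // of two; those are directly selectable without an extra negate.
    if (!Imm.isNegatedPowerOf2() && Imm.getLoBits(3).isZero())
      return false;
    return true;
  }

  int main() {
    // (sub X, vscale*2): -2 is a negated power of two, so the combine
    // still canonicalizes to (add X, vscale*-2).
    std::cout << isPreferVScaleConst(APInt(64, -2, /*isSigned=*/true), false) << '\n';  // 1
    // (sub X, vscale*24): -24 is a multiple of 8 but not a negated power
    // of two; keep the sub so it selects as cnth x8, all, mul #3 + sub.
    std::cout << isPreferVScaleConst(APInt(64, -24, /*isSigned=*/true), false) << '\n'; // 0
  }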
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++-
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 ++
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 10 ++++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 ++
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 3 +--
6 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a4dc097446186a..c1bec56f60e63b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3847,6 +3847,8 @@ class TargetLowering : public TargetLoweringBase {
/// legal. It is frequently not legal in PIC relocation models.
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ virtual bool isPreferVScaleConst(const APInt Imm) const;
+
/// On x86, return true if the operand with index OpNo is a CALL or JUMP
/// instruction, which can use either a memory constraint or an address
/// constraint. -fasm-blocks "__asm call foo" lowers to
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2f46b23a97c62c..9010d7256716fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3979,7 +3979,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
if (N1.getOpcode() == ISD::VSCALE && N1.hasOneUse()) {
const APInt &IntVal = N1.getConstantOperandAPInt(0);
- return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
+ if (TLI.isPreferVScaleConst(-IntVal))
+ return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
}
// canonicalize (sub X, step_vector(C)) to (add X, step_vector(-C))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 962f0d98e3be90..7e234d77e7d638 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -502,6 +502,8 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
return true;
}
+bool TargetLowering::isPreferVScaleConst(const APInt Imm) const { return true; }
+
//===----------------------------------------------------------------------===//
// Optimization Methods
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f552f91929201c..c86bd6d54b0232 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10580,6 +10580,16 @@ bool AArch64TargetLowering::isOffsetFoldingLegal(
return false;
}
+bool AArch64TargetLowering::isPreferVScaleConst(const APInt Imm) const {
+ if (Subtarget->hasFeature(AArch64::FeatureUseScalarIncVL))
+ return true;
+
+ // Multiples of 16 can use the rdvl instruction.
+ if (!Imm.isNegatedPowerOf2() && Imm.getLoBits(3).isZero())
+ return false;
+ return true;
+}
+
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool OptForSize) const {
bool IsLegal = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3465f3be887543..2f8e9b5839a0e2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -606,6 +606,8 @@ class AArch64TargetLowering : public TargetLowering {
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ bool isPreferVScaleConst(const APInt Imm) const override;
+
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index dd4294c8d3bdcc..33869ade84aede 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -204,8 +204,7 @@ define i64 @dech_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: dech_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cnth x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: dech_scalar_i64:
From e20d65f1c4b3474eb3874da20af49e195cc54f4c Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Tue, 9 Apr 2024 05:25:16 -0400
Subject: [PATCH 2/2] [AArch64][CodeGen] Add patterns for small negative VScale
const
On AArch64, rdvl can accept a negative value, while cntd/cntw/cnth can't.
Since we support VScale with a negative multiply value, we do not limit
the negative value, and instead take the hit of having the extra patterns.
Also add NoUseScalarIncVL to avoid affecting patterns that work for -mattr=+use-scalar-inc-vl.
Fixes https://github.com/llvm/llvm-project/issues/84620
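As a sanity check on the arithmetic the new patterns rely on, a small sketch
(the vscale loop is a stand-in for the runtime value; the element counts per
instruction follow the SVE definitions):

  #include <cassert>
  #include <cstdint>

  // cnth/cntd count halfwords/doublewords per vector times a multiplier;
  // rdvl reads the vector length in bytes times a signed immediate.
  int64_t cnth(int64_t vs, int64_t m) { return m * 8 * vs; }
  int64_t cntd(int64_t vs, int64_t m) { return m * 2 * vs; }
  int64_t rdvl(int64_t vs, int64_t imm) { return imm * 16 * vs; }

  int main() {
    for (int64_t vs = 1; vs <= 16; ++vs) {
      int64_t x = 1 << 20;
      // (add X, (vscale * -24)) -> cnth x8, all, mul #3 ; sub x0, x0, x8
      assert(x + vs * -24 == x - cnth(vs, 3));
      // (add X, (vscale * -2))  -> cntd x8 ; sub x0, x0, x8
      assert(x + vs * -2 == x - cntd(vs, 1));
      // (add X, (vscale * -32)) -> rdvl x8, #-2 ; add x0, x0, x8
      assert(x + vs * -32 == x + rdvl(vs, -2));
    }
  }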
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 ++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 21 +++++++++++++++++++
...plex-deinterleaving-reductions-scalable.ll | 6 +++---
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 15 +++++--------
.../vscale-and-sve-cnt-demandedbits.ll | 10 ++++-----
5 files changed, 36 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b1f514f75207f0..279e3aedcc58d8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -310,6 +310,8 @@ def UseNegativeImmediates
def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
+def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
+
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dd5e11c0f5e35d..24800a42fb4acc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2562,6 +2562,27 @@ let Predicates = [HasSVEorSME] in {
sub_32)>;
}
+ // Add NoUseScalarIncVL to avoid conflicting with the patterns guarded by UseScalarIncVL
+ let Predicates = [NoUseScalarIncVL] in {
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDXrs GPR64:$op, (RDVLI_XI $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTD_XPiI 31, $imm), 0)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (ADDSWrr GPR32:$op, (EXTRACT_SUBREG (RDVLI_XI $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTH_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTW_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTD_XPiI 31, $imm), sub_32))>;
+ }
+
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 1696ac8709d406..2d9f939f0e6664 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -219,7 +219,7 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x13
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index 33869ade84aede..98d96da427c4f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -221,8 +221,7 @@ define i64 @decw_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: decw_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntw x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decw_scalar_i64:
@@ -239,8 +238,7 @@ define i64 @decd_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: decd_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntd x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decd_scalar_i64:
@@ -366,8 +364,7 @@ define i32 @dech_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: dech_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cnth x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: dech_scalar_i32:
@@ -388,8 +385,7 @@ define i32 @decw_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: decw_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntw x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decw_scalar_i32:
@@ -410,8 +406,7 @@ define i32 @decd_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: decd_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntd x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decd_scalar_i32:
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index dbdab799c83522..9572778484f8d3 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -194,7 +194,7 @@ define i32 @vscale_with_multiplier() vscale_range(1,16) {
; CHECK-LABEL: vscale_with_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov w9, #5
+; CHECK-NEXT: mov w9, #5 // =0x5
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: mul x8, x8, x9
; CHECK-NEXT: and w9, w8, #0x3f
@@ -212,7 +212,7 @@ define i32 @vscale_with_negative_multiplier() vscale_range(1,16) {
; CHECK-LABEL: vscale_with_negative_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov x9, #-5
+; CHECK-NEXT: mov x9, #-5 // =0xfffffffffffffffb
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: mul x8, x8, x9
; CHECK-NEXT: and w9, w8, #0xffffffc0
@@ -230,9 +230,9 @@ define i32 @pow2_vscale_with_negative_multiplier() vscale_range(1,16) {
; CHECK-LABEL: pow2_vscale_with_negative_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: orr w9, w8, #0xfffffff0
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: neg x9, x8
+; CHECK-NEXT: orr w9, w9, #0xfffffff0
+; CHECK-NEXT: sub w0, w9, w8
; CHECK-NEXT: ret
%vscale = call i32 @llvm.vscale.i32()
%mul = mul i32 %vscale, -2