[llvm] [AArch64][CodeGen] Optimize register zero initialization in svsub_x (PR #149840)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 09:12:52 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: YafetBeyene (yafet-a)
GCC and LLVM differ in how they zero the result register when an SVE subtraction is given the same operand twice (x - x).
GCC:
```asm
f(__SVInt32_t, __SVBool_t):
movi d0, #0
ret
```
LLVM:
```asm
f(__SVInt32_t, __SVBool_t):
movi v0.2d, #0000000000000000
ret
```
The full example is available at https://godbolt.org/z/EKEarzT4s.
Both instructions effectively zero (at least) the lower 64 bits of V0; the difference is that the LLVM form explicitly zeros the entire 128-bit register, while the GCC form only explicitly zeros the lower 64 bits.
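For context, a minimal reproducer along these lines (a sketch, compiled with SVE enabled, e.g. `-march=armv8-a+sve`; the exact source behind the godbolt link may differ slightly) is:
```c
#include <arm_sve.h>

// Subtracting a vector from itself through the "don't care"
// predication form: the result is always zero, so the only question
// is which movi encoding the backend picks to materialize it.
svint32_t f(svint32_t x, svbool_t pg) {
  return svsub_x(pg, x, x);
}
```
With identical operands the subtraction constant-folds to zero; this patch makes LLVM materialize that zero with the 64-bit `movi d0, #0` form, matching GCC.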
---
Full diff: https://github.com/llvm/llvm-project/pull/149840.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+12)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+13)
- (added) llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll (+99)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726c3f484..1b4579ace80bd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22350,12 +22350,24 @@ static SDValue performIntrinsicCombine(SDNode *N,
return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_fsub_u:
+ // Detect x - x pattern and generate scalar zero initialization
+ if (N->getOperand(2) == N->getOperand(3)) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ return DAG.getNode(AArch64ISD::SVE_SCALAR_ZERO, DL, VT);
+ }
return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_add_u:
return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
N->getOperand(3));
case Intrinsic::aarch64_sve_sub_u:
+ // Detect x - x pattern and generate scalar zero initialization
+ if (N->getOperand(2) == N->getOperand(3)) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ return DAG.getNode(AArch64ISD::SVE_SCALAR_ZERO, DL, VT);
+ }
return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
N->getOperand(3));
case Intrinsic::aarch64_sve_subr:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a2571b076e..f560346987470 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -480,6 +480,7 @@ def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
+def SDT_AArch64SVEScalarZero : SDTypeProfile<1, 0, [SDTCisVec<0>]>;
def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
@@ -996,6 +997,9 @@ def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+// SVE scalar zero - enables x-x subtraction optimization
+def AArch64sve_scalar_zero: SDNode<"AArch64ISD::SVE_SCALAR_ZERO", SDT_AArch64SVEScalarZero>;
+
// Produces the full sequence of instructions for getting the thread pointer
// offset of a variable into X0, using the TLSDesc model.
def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
@@ -8076,6 +8080,15 @@ def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVI
def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
}
+// x-x subtraction optimizations using SVE scalar zero node
+def : Pat<(nxv16i8 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv8i16 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv4i32 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv2i64 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv8f16 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv4f32 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv2f64 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+
def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
diff --git a/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
new file mode 100644
index 0000000000000..e8423e36a4e02
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; Test that svsub_u32_x(a, a) generates efficient scalar zeroing (movi d0, #0)
+; instead of vector zeroing (movi v0.2d, #0), while other zero splats remain unchanged.
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+; Test the a - a optimization case
+define <vscale x 4 x i32> @sve_svsub_same_operand(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: sve_svsub_same_operand:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %result
+}
+
+; Test that regular subtraction (a - b) is unaffected
+define <vscale x 4 x i32> @sve_svsub_different_operands(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sve_svsub_different_operands:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %result = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %result
+}
+
+; Test that floating-point different operands also work normally
+define <vscale x 4 x float> @sve_svfsub_different_operands(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: sve_svfsub_different_operands:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ %result = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 4 x float> %result
+}
+
+; Test multiple data types - all should use scalar movi d0, #0
+define <vscale x 16 x i8> @sve_svsub_same_operand_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
+ ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 8 x i16> @sve_svsub_same_operand_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %a)
+ ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 2 x i64> @sve_svsub_same_operand_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %result
+}
+
+; Test floating-point subtraction optimization
+define <vscale x 4 x float> @sve_svfsub_same_operand_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: sve_svfsub_same_operand_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %a)
+ ret <vscale x 4 x float> %result
+}
+
+define <vscale x 2 x double> @sve_svfsub_same_operand_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: sve_svfsub_same_operand_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %result = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %result
+}
+
+; Test that regular zero splats still use vector instruction (preserving existing behavior)
+define <vscale x 4 x i32> @sve_regular_zero_splat() {
+; CHECK-LABEL: sve_regular_zero_splat:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ret
+ ret <vscale x 4 x i32> zeroinitializer
+}
``````````
https://github.com/llvm/llvm-project/pull/149840