[llvm] [AArch64][CodeGen] Optimize register zero initialization in svsub_x (PR #149840)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 21 09:12:22 PDT 2025


https://github.com/yafet-a created https://github.com/llvm/llvm-project/pull/149840

There is a difference in how GCC and LLVM zero the result register for `svsub_x` when both operands are the same.
GCC:
```asm
f(__SVInt32_t, __SVBool_t):       
  movi    d0, #0        
  ret 
```
LLVM:
```asm
f(__SVInt32_t, __SVBool_t):        
  movi    v0.2d, #0000000000000000        
  ret 
```

The full example is provided below:
https://godbolt.org/z/EKEarzT4s

Both instructions leave the vector register fully zeroed (an AArch64 write to D0 also clears the upper bits of V0), but the LLVM sequence explicitly zeros the entire 128-bit register, while the GCC sequence only explicitly writes the lower 64 bits.

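For context, here is a minimal sketch of the kind of source that hits this pattern (assuming C++ with the ACLE SVE intrinsics; the exact code behind the Godbolt link may differ slightly):

```cpp
// Hypothetical reproducer: svsub_x with the same vector for both operands.
// The integer subtraction a - a folds to zero, so only a register
// zero-initialization should remain in the generated code.
#include <arm_sve.h>

svint32_t f(svint32_t a, svbool_t pg) {
  return svsub_x(pg, a, a);
}
```

With SVE enabled and optimizations on, this roughly corresponds to the `f(__SVInt32_t, __SVBool_t)` shown in the assembly above.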

From 174a4303b0e40ac85a0a6cddc525ceb928fdfa0a Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Mon, 21 Jul 2025 08:19:50 -0700
Subject: [PATCH 1/2] [AArch64][Tests] Pre-Commit Test

---
 .../CodeGen/AArch64/sve-svsub-same-operand.ll | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
new file mode 100644
index 0000000000000..957641c69c0a4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; Test that svsub_u32_x(a, a) generates efficient scalar zeroing (movi d0, #0)
+; instead of vector zeroing (movi v0.2d, #0), while other zero splats remain unchanged.
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.u.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+; Test the a - a optimization case
+define <vscale x 4 x i32> @sve_svsub_same_operand(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: sve_svsub_same_operand:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %result = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %a)
+  ret <vscale x 4 x i32> %result
+}
+
+; Test that regular subtraction (a - b) is unaffected
+define <vscale x 4 x i32> @sve_svsub_different_operands(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sve_svsub_different_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %result = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %result
+}
+
+; Test that floating-point different operands also work normally
+define <vscale x 4 x float> @sve_svfsub_different_operands(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: sve_svfsub_different_operands:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %result = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %result
+}
+
+; Test multiple data types - all should use scalar movi d0, #0
+define <vscale x 16 x i8> @sve_svsub_same_operand_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %result = call <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %result
+}
+
+define <vscale x 8 x i16> @sve_svsub_same_operand_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %result = call <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %a)
+  ret <vscale x 8 x i16> %result
+}
+
+define <vscale x 2 x i64> @sve_svsub_same_operand_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: sve_svsub_same_operand_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  %result = call <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %a)
+  ret <vscale x 2 x i64> %result
+}
+
+; Test floating-point subtraction optimization
+define <vscale x 4 x float> @sve_svfsub_same_operand_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+; CHECK-LABEL: sve_svfsub_same_operand_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsubr z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    ret
+  %result = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %a)
+  ret <vscale x 4 x float> %result
+}
+
+define <vscale x 2 x double> @sve_svfsub_same_operand_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: sve_svfsub_same_operand_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsubr z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    ret
+  %result = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %result
+}
+
+; Test that regular zero splats still use vector instruction (preserving existing behavior)
+define <vscale x 4 x i32> @sve_regular_zero_splat() {
+; CHECK-LABEL: sve_regular_zero_splat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ret
+  ret <vscale x 4 x i32> zeroinitializer
+}

From d0e5673594df321128887d32c6b8dd3e581fdab7 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Mon, 21 Jul 2025 08:30:53 -0700
Subject: [PATCH 2/2] [AArch64][CodeGen] Introduce SVE scalar zero node to
 optimize x-x subtraction patterns

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp     | 12 ++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td         | 13 +++++++++++++
 llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll | 12 ++++++------
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726c3f484..1b4579ace80bd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22350,12 +22350,24 @@ static SDValue performIntrinsicCombine(SDNode *N,
     return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
   case Intrinsic::aarch64_sve_fsub_u:
+    // Detect x - x pattern and generate scalar zero initialization
+    if (N->getOperand(2) == N->getOperand(3)) {
+      SDLoc DL(N);
+      EVT VT = N->getValueType(0);
+      return DAG.getNode(AArch64ISD::SVE_SCALAR_ZERO, DL, VT);
+    }
     return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
   case Intrinsic::aarch64_sve_add_u:
     return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
                        N->getOperand(3));
   case Intrinsic::aarch64_sve_sub_u:
+    // Detect x - x pattern and generate scalar zero initialization
+    if (N->getOperand(2) == N->getOperand(3)) {
+      SDLoc DL(N);
+      EVT VT = N->getValueType(0);
+      return DAG.getNode(AArch64ISD::SVE_SCALAR_ZERO, DL, VT);
+    }
     return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
                        N->getOperand(3));
   case Intrinsic::aarch64_sve_subr:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9f8a2571b076e..f560346987470 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -480,6 +480,7 @@ def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
 def SDT_AArch64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
 def SDT_AArch64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
 def SDT_AArch64Insr  : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
+def SDT_AArch64SVEScalarZero : SDTypeProfile<1, 0, [SDTCisVec<0>]>;
 def SDT_AArch64Zip   : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                           SDTCisSameAs<0, 1>,
                                           SDTCisSameAs<0, 2>]>;
@@ -996,6 +997,9 @@ def AArch64Prefetch        : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
 def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
 def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
 
+// SVE scalar zero - enables x-x subtraction optimization
+def AArch64sve_scalar_zero: SDNode<"AArch64ISD::SVE_SCALAR_ZERO", SDT_AArch64SVEScalarZero>;
+
 // Produces the full sequence of instructions for getting the thread pointer
 // offset of a variable into X0, using the TLSDesc model.
 def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
@@ -8076,6 +8080,15 @@ def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVI
 def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>;
 }
 
+// x-x subtraction optimizations using SVE scalar zero node
+def : Pat<(nxv16i8 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv8i16 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv4i32 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv2i64 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv8f16 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv4f32 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+def : Pat<(nxv2f64 (AArch64sve_scalar_zero)), (SUBREG_TO_REG (i32 0), (MOVID (i32 0)), dsub)>;
+
 def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
diff --git a/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
index 957641c69c0a4..e8423e36a4e02 100644
--- a/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
+++ b/llvm/test/CodeGen/AArch64/sve-svsub-same-operand.ll
@@ -16,7 +16,7 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1>
 define <vscale x 4 x i32> @sve_svsub_same_operand(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
 ; CHECK-LABEL: sve_svsub_same_operand:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %a)
   ret <vscale x 4 x i32> %result
@@ -46,7 +46,7 @@ define <vscale x 4 x float> @sve_svfsub_different_operands(<vscale x 4 x i1> %pg
 define <vscale x 16 x i8> @sve_svsub_same_operand_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
 ; CHECK-LABEL: sve_svsub_same_operand_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %result
@@ -55,7 +55,7 @@ define <vscale x 16 x i8> @sve_svsub_same_operand_i8(<vscale x 16 x i1> %pg, <vs
 define <vscale x 8 x i16> @sve_svsub_same_operand_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
 ; CHECK-LABEL: sve_svsub_same_operand_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %a)
   ret <vscale x 8 x i16> %result
@@ -64,7 +64,7 @@ define <vscale x 8 x i16> @sve_svsub_same_operand_i16(<vscale x 8 x i1> %pg, <vs
 define <vscale x 2 x i64> @sve_svsub_same_operand_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
 ; CHECK-LABEL: sve_svsub_same_operand_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %a)
   ret <vscale x 2 x i64> %result
@@ -74,7 +74,7 @@ define <vscale x 2 x i64> @sve_svsub_same_operand_i64(<vscale x 2 x i1> %pg, <vs
 define <vscale x 4 x float> @sve_svfsub_same_operand_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: sve_svfsub_same_operand_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fsubr z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %a)
   ret <vscale x 4 x float> %result
@@ -83,7 +83,7 @@ define <vscale x 4 x float> @sve_svfsub_same_operand_f32(<vscale x 4 x i1> %pg,
 define <vscale x 2 x double> @sve_svfsub_same_operand_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
 ; CHECK-LABEL: sve_svfsub_same_operand_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fsubr z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
   %result = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
   ret <vscale x 2 x double> %result


