[llvm] c62d5f3 - [AArch64] Avoid folding sign-extend of vector extracts into ALU ops (#183522)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 02:13:03 PST 2026
Author: Benjamin Maxwell
Date: 2026-03-04T10:12:58Z
New Revision: c62d5f35b6785ff1fac0047c6e5d9e6dd8f745f8
URL: https://github.com/llvm/llvm-project/commit/c62d5f35b6785ff1fac0047c6e5d9e6dd8f745f8
DIFF: https://github.com/llvm/llvm-project/commit/c62d5f35b6785ff1fac0047c6e5d9e6dd8f745f8.diff
LOG: [AArch64] Avoid folding sign-extend of vector extracts into ALU ops (#183522)
This breaks a tie where the `SEXT_IN_REG` in an expression like
`SUB(0, SEXT_IN_REG(VECTOR_EXTRACT(..)))` can fold into the `SUB` or the
`VECTOR_EXTRACT`. Currently, the `SUB` is always preferred, but it's
better to fold the `SEXT_IN_REG` into the `VECTOR_EXTRACT`,
which allows `SMOV` to be used.
Added:
llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
llvm/test/CodeGen/AArch64/combine-sdiv.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 67f4e127b0c87..2a7e53216c62b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -989,6 +989,18 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
+ // Don't match sext of vector extracts. These can use SMOV, but if we match
+ // this as an extended register, we'll always fold the extend into an ALU op
+ // user of the extend (which results in a UMOV).
+ if (AArch64_AM::isSignExtendShiftType(Ext)) {
+ SDValue Op = N.getOperand(0);
+ if (Op->getOpcode() == ISD::ANY_EXTEND)
+ Op = Op->getOperand(0);
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op.getOperand(0).getValueType().isFixedLengthVector())
+ return false;
+ }
+
Reg = N.getOperand(0);
// Don't match if free 32-bit -> 64-bit zext can be used instead. Use the
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 734948d55766b..62e678d5778f8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -7151,17 +7151,6 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
[](MachineInstr &Use) { return Use.mayLoadOrStore(); });
}
-static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
- switch (Type) {
- case AArch64_AM::SXTB:
- case AArch64_AM::SXTH:
- case AArch64_AM::SXTW:
- return true;
- default:
- return false;
- }
-}
-
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtendedSHL(
MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
@@ -7244,7 +7233,7 @@ AArch64InstructionSelector::selectExtendedSHL(
if (Ext == AArch64_AM::InvalidShiftExtend)
return std::nullopt;
- SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
+ SignExtend = AArch64_AM::isSignExtendShiftType(Ext) ? 1 : 0;
// We only support SXTW for signed extension here.
if (SignExtend && Ext != AArch64_AM::SXTW)
return std::nullopt;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 1d3f089e2130b..0492fdf3442c1 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -48,6 +48,19 @@ enum ShiftExtendType {
SXTX,
};
+/// isSignExtendShiftType - Returns true if \p Type is sign extending.
+static inline bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
+ switch (Type) {
+ case AArch64_AM::SXTB:
+ case AArch64_AM::SXTH:
+ case AArch64_AM::SXTW:
+ case AArch64_AM::SXTX:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// getShiftName - Get the string encoding for the shift type.
static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
switch (ST) {
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index e804a941716e3..fbb33db3fb7a9 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -801,31 +801,30 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
; CHECK-SD-LABEL: non_splat_minus_one_divisor_0:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: umov w9, v0.b[0]
-; CHECK-SD-NEXT: mov w8, wzr
-; CHECK-SD-NEXT: umov w10, v0.b[1]
-; CHECK-SD-NEXT: sub w9, w8, w9, sxtb
-; CHECK-SD-NEXT: sub w10, w8, w10, sxtb
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: smov w9, v0.b[2]
-; CHECK-SD-NEXT: mov v1.b[1], w10
-; CHECK-SD-NEXT: umov w10, v0.b[3]
-; CHECK-SD-NEXT: mov v1.b[2], w9
-; CHECK-SD-NEXT: sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT: umov w10, v0.b[4]
-; CHECK-SD-NEXT: mov v1.b[3], w9
-; CHECK-SD-NEXT: sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT: umov w10, v0.b[5]
-; CHECK-SD-NEXT: mov v1.b[4], w9
-; CHECK-SD-NEXT: sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT: umov w10, v0.b[7]
-; CHECK-SD-NEXT: mov v1.b[5], w9
-; CHECK-SD-NEXT: smov w9, v0.b[6]
-; CHECK-SD-NEXT: mov v1.b[6], w9
-; CHECK-SD-NEXT: sub w9, w8, w10, sxtb
-; CHECK-SD-NEXT: umov w10, v0.b[8]
-; CHECK-SD-NEXT: mov v1.b[7], w9
-; CHECK-SD-NEXT: sub w8, w8, w10, sxtb
+; CHECK-SD-NEXT: smov w8, v0.b[0]
+; CHECK-SD-NEXT: smov w9, v0.b[1]
+; CHECK-SD-NEXT: neg w8, w8
+; CHECK-SD-NEXT: neg w9, w9
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: smov w8, v0.b[2]
+; CHECK-SD-NEXT: mov v1.b[1], w9
+; CHECK-SD-NEXT: smov w9, v0.b[3]
+; CHECK-SD-NEXT: mov v1.b[2], w8
+; CHECK-SD-NEXT: neg w8, w9
+; CHECK-SD-NEXT: smov w9, v0.b[4]
+; CHECK-SD-NEXT: mov v1.b[3], w8
+; CHECK-SD-NEXT: neg w8, w9
+; CHECK-SD-NEXT: smov w9, v0.b[5]
+; CHECK-SD-NEXT: mov v1.b[4], w8
+; CHECK-SD-NEXT: neg w8, w9
+; CHECK-SD-NEXT: smov w9, v0.b[7]
+; CHECK-SD-NEXT: mov v1.b[5], w8
+; CHECK-SD-NEXT: smov w8, v0.b[6]
+; CHECK-SD-NEXT: mov v1.b[6], w8
+; CHECK-SD-NEXT: neg w8, w9
+; CHECK-SD-NEXT: smov w9, v0.b[8]
+; CHECK-SD-NEXT: mov v1.b[7], w8
+; CHECK-SD-NEXT: neg w8, w9
; CHECK-SD-NEXT: mov v1.b[8], w8
; CHECK-SD-NEXT: smov w8, v0.b[9]
; CHECK-SD-NEXT: mov v1.b[9], w8
@@ -1154,18 +1153,17 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform7:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: umov w9, v0.h[0]
-; CHECK-SD-NEXT: mov w8, wzr
-; CHECK-SD-NEXT: umov w10, v0.h[1]
-; CHECK-SD-NEXT: umov w11, v0.h[2]
-; CHECK-SD-NEXT: sub w9, w8, w9, sxth
-; CHECK-SD-NEXT: sub w10, w8, w10, sxth
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: sub w9, w8, w11, sxth
-; CHECK-SD-NEXT: mov v1.h[1], w10
-; CHECK-SD-NEXT: umov w10, v0.h[3]
-; CHECK-SD-NEXT: mov v1.h[2], w9
-; CHECK-SD-NEXT: sub w8, w8, w10, sxth
+; CHECK-SD-NEXT: smov w8, v0.h[0]
+; CHECK-SD-NEXT: smov w9, v0.h[1]
+; CHECK-SD-NEXT: neg w8, w8
+; CHECK-SD-NEXT: neg w9, w9
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: smov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v1.h[1], w9
+; CHECK-SD-NEXT: smov w9, v0.h[3]
+; CHECK-SD-NEXT: neg w8, w8
+; CHECK-SD-NEXT: mov v1.h[2], w8
+; CHECK-SD-NEXT: neg w8, w9
; CHECK-SD-NEXT: mov v1.h[3], w8
; CHECK-SD-NEXT: smov w8, v0.h[4]
; CHECK-SD-NEXT: mov v1.h[4], w8
diff --git a/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
new file mode 100644
index 0000000000000..94cd2b56a3655
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extend_vecreduce_add.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -start-before=codegenprepare | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @vmask_reduce_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: smov w0, v0.b[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = sext <8 x i1> %mask to <8 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t1)
+ %t3 = sext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov w0, v0.b[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = sext <16 x i1> %mask to <16 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+ %t3 = sext i8 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov w0, v0.h[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = sext <4 x i1> %mask to <4 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+ %t3 = sext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i32 @vmask_reduce_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: smov w0, v0.h[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = sext <8 x i1> %mask to <8 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t1)
+ %t3 = sext i16 %t2 to i32
+ ret i32 %t3
+}
+
+define i64 @vmask_reduce_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_reduce_i32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: smov x0, v0.s[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <2 x i32> %a, %b
+ %t1 = sext <2 x i1> %mask to <2 x i32>
+ %t2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %t1)
+ %t3 = sext i32 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: smov x0, v0.b[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i8> %a, %b
+ %t1 = sext <8 x i1> %mask to <8 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t1)
+ %t3 = sext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov x0, v0.b[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = sext <16 x i1> %mask to <16 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+ %t3 = sext i8 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov x0, v0.h[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = sext <4 x i1> %mask to <4 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+ %t3 = sext i16 %t2 to i64
+ ret i64 %t3
+}
+
+define i64 @vmask_reduce_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: smov x0, v0.h[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <8 x i16> %a, %b
+ %t1 = sext <8 x i1> %mask to <8 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t1)
+ %t3 = sext i16 %t2 to i64
+ ret i64 %t3
+}
+
+; TODO: We should use a saddlv here to avoid the smov.
+define i64 @vmask_reduce_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_reduce_i64_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: smov x0, v0.s[0]
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i32> %a, %b
+ %t1 = sext <4 x i1> %mask to <4 x i32>
+ %t2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %t1)
+ %t3 = sext i32 %t2 to i64
+ ret i64 %t3
+}
+
+define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov w8, v0.b[0]
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = sext <16 x i1> %mask to <16 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+ %t3 = sext i8 %t2 to i32
+ %t4 = sub i32 0, %t3
+ ret i32 %t4
+}
+
+define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov w8, v0.h[0]
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = sext <4 x i1> %mask to <4 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+ %t3 = sext i16 %t2 to i32
+ %t4 = sub i32 0, %t3
+ ret i32 %t4
+}
+
+define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov x8, v0.b[0]
+; CHECK-NEXT: neg x0, x8
+; CHECK-NEXT: ret
+ %mask = icmp slt <16 x i8> %a, %b
+ %t1 = sext <16 x i1> %mask to <16 x i8>
+ %t2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %t1)
+ %t3 = sext i8 %t2 to i64
+ %t4 = sub i64 0, %t3
+ ret i64 %t4
+}
+
+define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov x8, v0.h[0]
+; CHECK-NEXT: neg x0, x8
+; CHECK-NEXT: ret
+ %mask = icmp slt <4 x i16> %a, %b
+ %t1 = sext <4 x i1> %mask to <4 x i16>
+ %t2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %t1)
+ %t3 = sext i16 %t2 to i64
+ %t4 = sub i64 0, %t3
+ ret i64 %t4
+}
More information about the llvm-commits
mailing list