[llvm] [AArch64][SelectionDAG] use fmov for constant forming of {1.0,0.0} (PR #189921)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 02:07:28 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Shreeyash Pandey (bojle)
<details>
<summary>Changes</summary>
Fixes https://github.com/llvm/llvm-project/issues/81377
Implements a check in ConstantBuildVector and emits an insert_vector SDNode into the graph. A tablegen pattern that identifies this insert_vector with arguments of the form {1.0, 0.0} and lowers it to fmov has also been added.
---
Full diff: https://github.com/llvm/llvm-project/pull/189921.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+114)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+16)
- (modified) llvm/test/CodeGen/AArch64/arm64-build-vector.ll (+66)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 38db1ac4a2fb9..93715aa9ad18f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16111,6 +16111,120 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
return R;
+ // If the low 32 or 64 bits are a non-zero FP constant and all upper bits
+ // are zero, lower to a scalar FMOV inserted into a zero vector. On
+ // AArch64, writing a scalar FP register (s0 or d0) automatically zeroes
+ // the upper bits of the enclosing vector register, so no explicit zeroing
+ // of the upper lanes is needed.
+ //
+ // We check the flat DefBits APInt directly, which handles both the
+ // integer-typed vector case (FP BUILD_VECTORs are bitcast to integer
+ // before reaching here) and the general case.
+ //
+ // We require at least 2 elements so there is at least one upper lane to
+ // zero, and to avoid an infinite loop: for a single-element vector
+ // (e.g. v1f64 or v1i64), INSERT_VECTOR_ELT would be lowered back to
+ // BUILD_VECTOR, re-entering this function.
+ //
+ // We also require the element size to be exactly 32 or 64 bits so that
+ // the non-zero DefBits in the low lane correspond to a single f32/f64
+ // lane 0 value. Smaller element types (e.g. v16i8, v8i16) can have
+ // non-zero bits in the low 32 bits that span multiple sub-32-bit lanes,
+ // which would be misinterpreted as an f32 bit-pattern.
+ if (VT.getVectorNumElements() >= 2) {
+ unsigned TotalBits = VT.getSizeInBits();
+ unsigned EltBits = VT.getVectorElementType().getSizeInBits();
+ SDLoc DL(Op);
+ MVT FpEltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ APInt LaneBits;
+ // True when Lo64 encodes two equal f32 lanes (v4f32 <fpimm,fpimm,0,0>);
+ // emit fmov v0.2s rather than fmov s0 in that case.
+ bool SplatV2f32 = false;
+
+ // 128-bit vector: check low-32-bits non-zero, high-96-bits zero -> fmov
+ // s0 or low-64-bits non-zero, high-64-bits zero -> fmov d0. 64-bit
+ // vector: check low-32-bits non-zero, high-32-bits zero -> fmov s0. In
+ // each case the element size must match the FP lane size so that the
+ // non-zero bits are confined to exactly lane 0 of the FP type.
+ if (TotalBits == 128) {
+ APInt Lo32 = DefBits.extractBits(32, 0);
+ APInt Hi96 = DefBits.extractBits(96, 32);
+ APInt Lo64 = DefBits.extractBits(64, 0);
+ APInt Hi64 = DefBits.extractBits(64, 64);
+ if (EltBits == 32 && !Lo32.isZero() && Hi96.isZero()) {
+ FpEltVT = MVT::f32;
+ LaneBits = Lo32;
+ } else if (EltBits == 64 && !Lo64.isZero() && Hi64.isZero()) {
+ // The DAG canonicalizes v4f32 constants to v2i64, so Lo64 may
+ // encode one or two f32 values rather than a single f64.
+ APInt Lo64Lo32 = Lo64.extractBits(32, 0);
+ APInt Lo64Hi32 = Lo64.extractBits(32, 32);
+ if (AArch64_AM::getFP64Imm(Lo64) != -1) {
+ // Native f64 immediate: <fpimm64, 0.0>.
+ FpEltVT = MVT::f64;
+ LaneBits = Lo64;
+ } else if (Lo64Hi32.isZero() &&
+ AArch64_AM::getFP32Imm(Lo64Lo32) != -1) {
+ // v4f32 <fpimm32, 0, 0, 0>: only lane 0 is non-zero.
+ FpEltVT = MVT::f32;
+ LaneBits = Lo64Lo32;
+ } else if (Lo64Hi32 == Lo64Lo32 &&
+ AArch64_AM::getFP32Imm(Lo64Lo32) != -1) {
+ // v4f32 <fpimm32, fpimm32, 0, 0>: low two lanes are equal.
+ FpEltVT = MVT::f32;
+ LaneBits = Lo64Lo32;
+ SplatV2f32 = true;
+ }
+ }
+ } else if (TotalBits == 64) {
+ APInt Lo32 = DefBits.extractBits(32, 0);
+ APInt Hi32 = DefBits.extractBits(32, 32);
+ if (EltBits == 32 && !Lo32.isZero() && Hi32.isZero()) {
+ FpEltVT = MVT::f32;
+ LaneBits = Lo32;
+ }
+ }
+
+ // Only proceed if the bit-pattern is a valid 8-bit AArch64 FP
+ // immediate. Non-FP constants (e.g. <i64 -1, i64 0>) can satisfy
+ // the bit-range checks above but cannot be encoded as fmov immediates,
+ // so emitting INSERT_VECTOR_ELT for them would produce worse code.
+ if (FpEltVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ APFloat FPVal(FpEltVT == MVT::f32 ? APFloat::IEEEsingle()
+ : APFloat::IEEEdouble(),
+ LaneBits);
+ if ((FpEltVT == MVT::f32 && AArch64_AM::getFP32Imm(LaneBits) == -1) ||
+ (FpEltVT == MVT::f64 && AArch64_AM::getFP64Imm(LaneBits) == -1))
+ FpEltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+ if (FpEltVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ APFloat FPVal(FpEltVT == MVT::f32 ? APFloat::IEEEsingle()
+ : APFloat::IEEEdouble(),
+ LaneBits);
+ SDValue FpScalar = DAG.getConstantFP(FPVal, DL, FpEltVT);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ if (SplatV2f32) {
+ // Emit a v2f32 splat (fmov v.2s) inserted into the low half of a
+ // zero v4f32. Writing d0 (FMOVv2f32_ns) zeroes the upper 64 bits
+ // of q0, correctly materialising <fpimm,fpimm,0,0> without a load.
+ SDValue Splat = DAG.getNode(
+ AArch64ISD::FMOV, DL, MVT::v2f32,
+ DAG.getConstant(AArch64_AM::getFP32Imm(LaneBits), DL, MVT::i32));
+ SDValue ZeroV4F32 = DAG.getBitcast(MVT::v4f32, Zero);
+ SDValue Ins =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v4f32, ZeroV4F32,
+ Splat, DAG.getVectorIdxConstant(0, DL));
+ return DAG.getBitcast(VT, Ins);
+ }
+ unsigned NumElts = TotalBits / FpEltVT.getSizeInBits();
+ MVT FpVT = MVT::getVectorVT(FpEltVT, NumElts);
+ SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FpVT,
+ DAG.getBitcast(FpVT, Zero), FpScalar,
+ DAG.getConstant(0, DL, MVT::i64));
+ return DAG.getBitcast(VT, Inserted);
+ }
+ }
+
// See if a fneg of the constant can be materialized with a MOVI, etc
auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
// FNegate each sub-element of the constant
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 08512f6ed8df1..8f7866c774d6e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8693,6 +8693,22 @@ def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
} // Predicates = [HasNEON, HasFullFP16]
}
+// vector_insert(zeros, fpimm, 0) -> SUBREG_TO_REG(FMOVSi/FMOVDi, ssub/dsub)
+// On AArch64, writing a scalar FP register zeroes the upper bits of the
+// enclosing vector register, so no explicit zeroing of the upper lanes is needed.
+def : Pat<(v2f32 (vector_insert (v2f32 immAllZerosV), (f32 fpimm32:$imm), (i64 0))),
+ (SUBREG_TO_REG (FMOVSi (fpimm32XForm f32:$imm)), ssub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 immAllZerosV), (f32 fpimm32:$imm), (i64 0))),
+ (SUBREG_TO_REG (FMOVSi (fpimm32XForm f32:$imm)), ssub)>;
+def : Pat<(v2f64 (vector_insert (v2f64 immAllZerosV), (f64 fpimm64:$imm), (i64 0))),
+ (SUBREG_TO_REG (FMOVDi (fpimm64XForm f64:$imm)), dsub)>;
+
+// v4f32 <fpimm,fpimm,0,0>: concat v2f32 splat with zero v2f32.
+// Writing d0 (FMOVv2f32_ns) zeroes the upper 64 bits of q0.
+def : Pat<(v4f32 (concat_vectors (v2f32 (AArch64fmov imm0_255:$imm)),
+ (v2f32 (bitconvert (i64 0))))),
+ (SUBREG_TO_REG (FMOVv2f32_ns imm0_255:$imm), dsub)>;
+
// AdvSIMD MOVI
// EDIT byte mask: scalar
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
index 914f431866cce..a55c57ebf14d9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -329,3 +329,69 @@ define <4 x bfloat> @negzero_v4bf16(<4 x bfloat> %a) {
; CHECK-NEXT: ret
ret <4 x bfloat> <bfloat -0.0, bfloat -0.0, bfloat -0.0, bfloat -0.0>
}
+
+; Test that BUILD_VECTOR with an FP constant in lane 0 and +0.0 in upper lanes
+; lowers to a scalar FMOV. On AArch64, writing a scalar FP register zeroes the
+; upper bits of the enclosing vector register.
+
+define <2 x float> @fmov_lane0_zero_upper_v2f32() {
+; CHECK-LABEL: fmov_lane0_zero_upper_v2f32:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, #1.00000000
+; CHECK-GI-NEXT: adrp x8, .LCPI23_0
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT: ret
+ ret <2 x float> <float 1.0, float 0.0>
+}
+
+define <2 x double> @fmov_lane0_zero_upper_v2f64() {
+; CHECK-LABEL: fmov_lane0_zero_upper_v2f64:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, #1.00000000
+; CHECK-GI-NEXT: adrp x8, .LCPI24_0
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI24_0]
+; CHECK-NEXT: ret
+ ret <2 x double> <double 1.0, double 0.0>
+}
+
+define <2 x float> @fmov_lane0_zero_upper_v2f32_half() {
+; CHECK-LABEL: fmov_lane0_zero_upper_v2f32_half:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, #0.50000000
+; CHECK-GI-NEXT: adrp x8, .LCPI25_0
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI25_0]
+; CHECK-NEXT: ret
+ ret <2 x float> <float 0.5, float 0.0>
+}
+
+define <2 x float> @fmov_lane0_zero_upper_v2f32_four() {
+; CHECK-LABEL: fmov_lane0_zero_upper_v2f32_four:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, #4.00000000
+; CHECK-GI-NEXT: adrp x8, .LCPI26_0
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI26_0]
+; CHECK-NEXT: ret
+ ret <2 x float> <float 4.0, float 0.0>
+}
+
+define <4 x float> @fmov_lane0_zero_upper_v4f32_two() {
+; CHECK-LABEL: fmov_lane0_zero_upper_v4f32_two:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, #2.00000000
+; CHECK-GI-NEXT: adrp x8, .LCPI27_0
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI27_0]
+; CHECK-NEXT: ret
+ ret <4 x float> <float 2.0, float 0.0, float 0.0, float 0.0>
+}
+
+; v4f32 <fpimm, fpimm, 0, 0>: low two lanes equal -> fmov v0.2s writes d0,
+; zeroing the upper 64 bits of q0.
+define <4 x float> @fmov_lane0_lane1_zero_upper_v4f32_two() {
+; CHECK-LABEL: fmov_lane0_lane1_zero_upper_v4f32_two:
+; CHECK: // %bb.0:
+; CHECK-SD-NEXT: fmov v0.2s, #2.00000000
+; CHECK-GI-NEXT: adrp x8, .LCPI28_0
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI28_0]
+; CHECK-NEXT: ret
+ ret <4 x float> <float 2.0, float 2.0, float 0.0, float 0.0>
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/189921
More information about the llvm-commits
mailing list