[llvm] [AArch64] Fold zero-high vector inserts in MI peephole optimisation (PR #182835)
Amina Chabane via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 01:35:15 PST 2026
https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/182835
>From 9fcff0076b13cdd0209d80e422f5b1dd713636c6 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 19 Feb 2026 17:48:12 +0000
Subject: [PATCH 1/3] [AArch64] Fold zero-high vector inserts in MI peephole
optimisation
Summary
This patch follows on from #178227.
The previous ISel fold lowers the 64-bit case to:
fmov d0, x0
fmov d0, d0
which is not ideal: it could be a single fmov d0, x0.
The redundant copy comes from the INSERT_SUBREG/INSvi64lane sequence.
This peephole detects <2 x i64> vectors whose upper lane is zeroed and whose
low lane is produced by FMOVXDr/FMOVDr, then removes the redundant copy.
Further updated tests and added MIR tests.
---
.../Target/AArch64/AArch64MIPeepholeOpt.cpp | 51 +++++++++++++++++--
llvm/test/CodeGen/AArch64/aarch64-addv.ll | 12 ++---
.../AArch64/aarch64-matrix-umull-smull.ll | 15 +++---
llvm/test/CodeGen/AArch64/bitcast-extend.ll | 4 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 1 -
llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll | 48 ++++++++---------
.../AArch64/neon-lowhalf128-optimisation.ll | 1 -
.../CodeGen/AArch64/peephole-insvigpr.mir | 51 +++++++++++++++++++
8 files changed, 137 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 21ff921da9b8a..398273babe1b1 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -687,14 +687,57 @@ bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
}
// All instructions that set a FPR64 will implicitly zero the top bits of the
-// register.
+// register. When the def is expressed as a COPY from a GPR, turn it into an
+// explicit FMOV so it cannot be elided later in further passes.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
- MachineRegisterInfo *MRI) {
+ MachineRegisterInfo *MRI,
+ const AArch64InstrInfo *TII) {
if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
return false;
const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
if (RC != &AArch64::FPR64RegClass)
return false;
+ if (MI->getOpcode() == TargetOpcode::COPY) {
+ MachineOperand &SrcOp = MI->getOperand(1);
+ if (!SrcOp.isReg())
+ return false;
+ if (SrcOp.getSubReg())
+ return false;
+ Register SrcReg = SrcOp.getReg();
+ auto IsGPR64Like = [&]() -> bool {
+ if (SrcReg.isVirtual())
+ return AArch64::GPR64allRegClass.hasSubClassEq(
+ MRI->getRegClass(SrcReg));
+ return AArch64::GPR64allRegClass.contains(SrcReg);
+ };
+ if (!IsGPR64Like())
+ return false;
+ assert(TII && "Expected InstrInfo when materializing COPYs");
+ // FMOVXDr insists on strict GPR64 operands, so fix up the COPY source.
+ MachineOperand &SrcMO = MI->getOperand(1);
+ bool SrcKill = SrcMO.isKill();
+ if (SrcReg.isVirtual()) {
+ if (MRI->getRegClass(SrcReg) != &AArch64::GPR64RegClass) {
+ // Pass the value through a temporary GPR64 vreg to satisfy the
+ // verifier.
+ Register NewSrc = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewSrc)
+ .addReg(SrcReg, getKillRegState(SrcKill));
+ SrcReg = NewSrc;
+ SrcKill = true;
+ }
+ } else if (!AArch64::GPR64RegClass.contains(SrcReg)) {
+ return false;
+ }
+ SrcMO.setReg(SrcReg);
+ SrcMO.setSubReg(0);
+ SrcMO.setIsKill(SrcKill);
+ // Replace the COPY with an explicit FMOV so the zeroing behaviour stays
+ // visible.
+ MI->setDesc(TII->get(AArch64::FMOVXDr));
+ return true;
+ }
return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}
@@ -710,7 +753,7 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
return false;
Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
- if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
+ if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI, TII))
return false;
// Check there is `mov 0` MI for high 64-bits.
@@ -751,7 +794,7 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
// An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
- if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
+ if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI, TII))
return false;
// Let's remove MIs for high 64-bits.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 9b60e0bde5a90..de68a79824eb3 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -543,8 +543,8 @@ define i8 @addv_zero_lanes_negative_v8i8(ptr %arr) {
define i8 @addv_zero_lanes_v16i8(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v16i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr b0, [x0]
-; CHECK-SD-NEXT: fmov d0, d0
+; CHECK-SD-NEXT: ldrb w8, [x0]
+; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
@@ -567,8 +567,8 @@ define i8 @addv_zero_lanes_v16i8(ptr %arr) {
define i16 @addv_zero_lanes_v8i16(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v8i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr h0, [x0]
-; CHECK-SD-NEXT: fmov d0, d0
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
@@ -591,8 +591,8 @@ define i16 @addv_zero_lanes_v8i16(ptr %arr) {
define i32 @addv_zero_lanes_v4i32(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v4i32:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr s0, [x0]
-; CHECK-SD-NEXT: fmov d0, d0
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 578a96aee6d59..ea55c198a70f1 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -823,14 +823,13 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-SD-NEXT: cbz x11, .LBB6_13
; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
; CHECK-SD-NEXT: mov w11, w1
-; CHECK-SD-NEXT: fmov d0, x8
-; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT: sxtb x8, w11
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: sxtb x11, w11
+; CHECK-SD-NEXT: fmov d2, x8
+; CHECK-SD-NEXT: dup v1.2s, w11
; CHECK-SD-NEXT: mov x11, x10
; CHECK-SD-NEXT: and x10, x9, #0xfffffffc
-; CHECK-SD-NEXT: fmov d0, d0
-; CHECK-SD-NEXT: dup v2.2s, w8
; CHECK-SD-NEXT: sub x8, x11, x10
; CHECK-SD-NEXT: add x11, x0, x11
; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
@@ -845,11 +844,11 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
; CHECK-SD-NEXT: xtn v5.2s, v5.2d
; CHECK-SD-NEXT: xtn v4.2s, v4.2d
-; CHECK-SD-NEXT: smlal v1.2d, v2.2s, v4.2s
-; CHECK-SD-NEXT: smlal v0.2d, v2.2s, v5.2s
+; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v4.2s
+; CHECK-SD-NEXT: smlal v2.2d, v1.2s, v5.2s
; CHECK-SD-NEXT: b.ne .LBB6_11
; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
-; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-SD-NEXT: cmp x10, x9
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index c68e55d7a8aef..b981c1701725a 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -339,8 +339,8 @@ define <8 x i8> @load_sext_i32_v8i8(ptr %p) {
define <16 x i8> @load_zext_v16i8(ptr %p) {
; CHECK-SD-LABEL: load_zext_v16i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr s0, [x0]
-; CHECK-SD-NEXT: fmov d0, d0
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: load_zext_v16i8:
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index c2490c1e1b331..24fff7506731f 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -602,7 +602,6 @@ define i128 @i128_mask(i128 %x) {
; CHECK-SD-NEXT: and x8, x0, #0xff
; CHECK-SD-NEXT: mov x1, xzr
; CHECK-SD-NEXT: fmov d0, x8
-; CHECK-SD-NEXT: fmov d0, d0
; CHECK-SD-NEXT: cnt v0.16b, v0.16b
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov x0, d0
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 6d94bb08f2e79..19494205f4155 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -684,9 +684,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -713,9 +713,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
@@ -1151,9 +1151,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -1181,9 +1181,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
@@ -1603,9 +1603,9 @@ define <2 x i64> @utest_f16i64(<2 x half> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -1633,9 +1633,9 @@ define <2 x i64> @utest_f16i64(<2 x half> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
@@ -2484,9 +2484,9 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -2513,9 +2513,9 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
@@ -2942,9 +2942,9 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -2972,9 +2972,9 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
@@ -3385,9 +3385,9 @@ define <2 x i64> @utest_f16i64_mm(<2 x half> %x) {
; CHECK-CVT-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-CVT-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-CVT-SD-NEXT: cmp x20, #0
-; CHECK-CVT-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-CVT-SD-NEXT: fmov d0, x8
-; CHECK-CVT-SD-NEXT: fmov d1, x9
+; CHECK-CVT-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-CVT-SD-NEXT: fmov d1, x8
; CHECK-CVT-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-CVT-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-CVT-SD-NEXT: add sp, sp, #48
@@ -3415,9 +3415,9 @@ define <2 x i64> @utest_f16i64_mm(<2 x half> %x) {
; CHECK-FP16-SD-NEXT: ldr x30, [sp, #16] // 8-byte Reload
; CHECK-FP16-SD-NEXT: csel x8, x0, xzr, eq
; CHECK-FP16-SD-NEXT: cmp x20, #0
-; CHECK-FP16-SD-NEXT: csel x9, x19, xzr, eq
; CHECK-FP16-SD-NEXT: fmov d0, x8
-; CHECK-FP16-SD-NEXT: fmov d1, x9
+; CHECK-FP16-SD-NEXT: csel x8, x19, xzr, eq
+; CHECK-FP16-SD-NEXT: fmov d1, x8
; CHECK-FP16-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-FP16-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-FP16-SD-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll b/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll
index 4e30813187fec..38be2992c8211 100644
--- a/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll
+++ b/llvm/test/CodeGen/AArch64/neon-lowhalf128-optimisation.ll
@@ -5,7 +5,6 @@ define <2 x i64> @low_vector_splat_v2i64_from_i64(i64 %0){
; CHECK-LABEL: low_vector_splat_v2i64_from_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
%2 = insertelement <1 x i64> poison, i64 %0, i64 0
%3 = shufflevector <1 x i64> %2, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
index aef01e42ed7cc..a68eda11d5ca1 100644
--- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
@@ -41,6 +41,11 @@
ret void
}
+ define void @insert_vec_from_gpr64_zero_high(i64 %v, ptr %dst) {
+ entry:
+ ret void
+ }
+
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
...
@@ -521,4 +526,50 @@ body: |
STRSui killed %16, %0, 0 :: (store (s32) into %ir.hist)
RET_ReallyLR
+---
+name: insert_vec_from_gpr64_zero_high
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gpr64common, preferred-register: '' }
+ - { id: 1, class: gpr64common, preferred-register: '' }
+ - { id: 2, class: fpr64, preferred-register: '' }
+ - { id: 3, class: fpr128, preferred-register: '' }
+ - { id: 4, class: fpr128, preferred-register: '' }
+ - { id: 5, class: fpr64, preferred-register: '' }
+ - { id: 6, class: fpr128, preferred-register: '' }
+ - { id: 7, class: fpr128, preferred-register: '' }
+ - { id: 8, class: fpr128, preferred-register: '' }
+liveins:
+ - { reg: '$x0', virtual-reg: '%0' }
+ - { reg: '$x1', virtual-reg: '%1' }
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: insert_vec_from_gpr64_zero_high
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PTR:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[VAL:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[GPR:%[0-9]+]]:gpr64 = COPY [[VAL]]
+ ; CHECK-NEXT: [[FMOV:%[0-9]+]]:fpr64 = FMOVXDr killed [[GPR]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_LOW:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[FMOV]], %subreg.dsub
+ ; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 0
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_ZERO:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], killed [[MOVID]], %subreg.dsub
+ ; CHECK-NEXT: STRQui killed [[INSERT_LOW]], [[PTR]], 0 :: (store (s128) into %ir.dst, align 8)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:fpr64 = COPY %1
+ %4:fpr128 = IMPLICIT_DEF
+ %3:fpr128 = INSERT_SUBREG %4, %2, %subreg.dsub
+ %5:fpr64 = MOVID 0
+ %7:fpr128 = IMPLICIT_DEF
+ %6:fpr128 = INSERT_SUBREG %7, killed %5, %subreg.dsub
+ %8:fpr128 = INSvi64lane %3, 1, killed %6, 0
+ STRQui killed %8, %0, 0 :: (store (s128) into %ir.dst, align 8)
+ RET_ReallyLR
+
...
>From 8778f23015b3ecfb24f0a673ef772bce59a25b3a Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 23 Feb 2026 12:18:15 +0000
Subject: [PATCH 2/3] Compile time regression test
---
.../aarch64-neonvector-tensorflow-regression.ll | 13 +++++++++++++
1 file changed, 13 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll b/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
new file mode 100644
index 0000000000000..716508fb78246
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -start-before=aarch64-isel %s -o /dev/null
+; Regression test for AArch64 compile-time regression, referring to PR #166962.
+
+define fastcc <2 x i64> @_ZN10tensorflow12_GLOBAL__N_125ComputeXWeightsAndIndicesERKNS_17ImageResizerStateEbPNSt3__u6vectorINS0_17WeightsAndIndicesENS4_9allocatorIS6_EEEE(<2 x i64> %0) {
+entry:
+ %1 = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> <i64 -1, i64 0>)
+ ret <2 x i64> %1
+}
+
+; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) #0
+
+attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
\ No newline at end of file
>From 55d4280c03439a8e09494755d09737d1b44dd627 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 25 Feb 2026 17:54:58 +0000
Subject: [PATCH 3/3] Update tests, remove regression test
---
llvm/test/CodeGen/AArch64/aarch64-addv.ll | 12 ++++++------
.../aarch64-neonvector-tensorflow-regression.ll | 13 -------------
llvm/test/CodeGen/AArch64/bitcast-extend.ll | 4 ++--
3 files changed, 8 insertions(+), 21 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index de68a79824eb3..9b60e0bde5a90 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -543,8 +543,8 @@ define i8 @addv_zero_lanes_negative_v8i8(ptr %arr) {
define i8 @addv_zero_lanes_v16i8(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v16i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: fmov d0, d0
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
@@ -567,8 +567,8 @@ define i8 @addv_zero_lanes_v16i8(ptr %arr) {
define i16 @addv_zero_lanes_v8i16(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v8i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: fmov d0, d0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
@@ -591,8 +591,8 @@ define i16 @addv_zero_lanes_v8i16(ptr %arr) {
define i32 @addv_zero_lanes_v4i32(ptr %arr) {
; CHECK-SD-LABEL: addv_zero_lanes_v4i32:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: fmov d0, d0
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll b/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
deleted file mode 100644
index 716508fb78246..0000000000000
--- a/llvm/test/CodeGen/AArch64/aarch64-neonvector-tensorflow-regression.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -start-before=aarch64-isel %s -o /dev/null
-; Regression test for AArch64 compile-time regression, referring to PR #166962.
-
-define fastcc <2 x i64> @_ZN10tensorflow12_GLOBAL__N_125ComputeXWeightsAndIndicesERKNS_17ImageResizerStateEbPNSt3__u6vectorINS0_17WeightsAndIndicesENS4_9allocatorIS6_EEEE(<2 x i64> %0) {
-entry:
- %1 = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> <i64 -1, i64 0>)
- ret <2 x i64> %1
-}
-
-; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
-declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) #0
-
-attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index b981c1701725a..c68e55d7a8aef 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -339,8 +339,8 @@ define <8 x i8> @load_sext_i32_v8i8(ptr %p) {
define <16 x i8> @load_zext_v16i8(ptr %p) {
; CHECK-SD-LABEL: load_zext_v16i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: fmov d0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: load_zext_v16i8:
More information about the llvm-commits
mailing list