[llvm] [AArch64][GlobalISel] Improve lowering of vector fp16 fptrunc and fpext (PR #163398)
Ryan Cowan via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 03:33:30 PDT 2025
https://github.com/HolyMolyCowMan updated https://github.com/llvm/llvm-project/pull/163398
>From e77ef4537744f6cc100fef83939f04cfaee1f7bb Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan.cowan at arm.com>
Date: Mon, 13 Oct 2025 12:14:00 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Improve lowering of vector fp16
fptrunc and fpext
---
llvm/lib/Target/AArch64/AArch64Combine.td | 9 +-
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 62 +++-
.../AArch64/GISel/AArch64LegalizerInfo.h | 2 +
.../GISel/AArch64PostLegalizerLowering.cpp | 194 ++++++++++++
.../GlobalISel/legalizer-info-validation.mir | 8 +-
llvm/test/CodeGen/AArch64/arm64-fp128.ll | 24 +-
llvm/test/CodeGen/AArch64/fmla.ll | 48 +--
.../CodeGen/AArch64/fp16-v4-instructions.ll | 73 +----
.../CodeGen/AArch64/fp16-v8-instructions.ll | 100 ++-----
llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll | 186 +++++-------
llvm/test/CodeGen/AArch64/fpext.ll | 49 ++-
llvm/test/CodeGen/AArch64/fptoi.ll | 278 ++++++------------
.../test/CodeGen/AArch64/fptosi-sat-vector.ll | 85 ++----
.../test/CodeGen/AArch64/fptoui-sat-vector.ll | 85 ++----
llvm/test/CodeGen/AArch64/fptrunc.ll | 101 +++----
15 files changed, 592 insertions(+), 712 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ecaeff77fcb4b..0c71844e3a73e 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -333,6 +333,13 @@ def combine_mul_cmlt : GICombineRule<
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;
+def lower_fptrunc_fptrunc: GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_FPTRUNC):$root,
+ [{ return matchFpTruncFpTrunc(*${root}, MRI); }]),
+ (apply [{ applyFpTruncFpTrunc(*${root}, MRI, B); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
@@ -341,7 +348,7 @@ def AArch64PostLegalizerLowering
[shuffle_vector_lowering, vashr_vlshr_imm,
icmp_lowering, build_vector_lowering,
lower_vector_fcmp, form_truncstore, fconstant_to_constant,
- vector_sext_inreg_to_shift,
+ vector_sext_inreg_to_shift, lower_fptrunc_fptrunc,
unmerge_ext_to_unmerge, lower_mulv2s64,
vector_unmerge_lowering, insertelt_nonconst,
unmerge_duplanes]> {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 9e2d698e04ae7..fde86449a76a7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -817,14 +818,31 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor(
{{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
.libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
- .clampNumElements(0, v4s16, v4s16)
- .clampNumElements(0, v2s32, v2s32)
+ .moreElementsToNextPow2(1)
+ .customIf([](const LegalityQuery &Q) {
+ LLT DstTy = Q.Types[0];
+ LLT SrcTy = Q.Types[1];
+ return SrcTy.isFixedVector() && DstTy.isFixedVector() &&
+ SrcTy.getScalarSizeInBits() == 64 &&
+ DstTy.getScalarSizeInBits() == 16;
+ })
+ // Clamp based on input
+ .clampNumElements(1, v4s32, v4s32)
+ .clampNumElements(1, v2s64, v2s64)
.scalarize(0);
getActionDefinitionsBuilder(G_FPEXT)
.legalFor(
{{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
.libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
+ .moreElementsToNextPow2(0)
+ .customIf([](const LegalityQuery &Q) {
+ LLT DstTy = Q.Types[0];
+ LLT SrcTy = Q.Types[1];
+ return SrcTy.isVector() && DstTy.isVector() &&
+ SrcTy.getScalarSizeInBits() == 16 &&
+ DstTy.getScalarSizeInBits() == 64;
+ })
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.scalarize(0);
@@ -1472,6 +1490,12 @@ bool AArch64LegalizerInfo::legalizeCustom(
return legalizeICMP(MI, MRI, MIRBuilder);
case TargetOpcode::G_BITCAST:
return legalizeBitcast(MI, Helper);
+ case TargetOpcode::G_FPEXT:
+ // In order to vectorize f16 to f64 properly, we need to use f32 as an
+ // intermediary
+ return legalizeViaF32(MI, MIRBuilder, MRI, TargetOpcode::G_FPEXT);
+ case TargetOpcode::G_FPTRUNC:
+ return legalizeViaF32(MI, MIRBuilder, MRI, TargetOpcode::G_FPTRUNC);
}
llvm_unreachable("expected switch to return");
@@ -2396,3 +2420,37 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
MI.eraseFromParent();
return true;
}
+
+bool AArch64LegalizerInfo::legalizeViaF32(MachineInstr &MI,
+ MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI,
+ unsigned Opcode) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+
+ LLT MidTy = LLT::fixed_vector(SrcTy.getNumElements(), LLT::scalar(32));
+
+ MachineInstrBuilder Mid;
+ MachineInstrBuilder Fin;
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ switch (Opcode) {
+ default:
+ return false;
+ case TargetOpcode::G_FPEXT: {
+ Mid = MIRBuilder.buildFPExt(MidTy, Src);
+ Fin = MIRBuilder.buildFPExt(DstTy, Mid.getReg(0));
+ break;
+ }
+ case TargetOpcode::G_FPTRUNC: {
+ Mid = MIRBuilder.buildFPTrunc(MidTy, Src);
+ Fin = MIRBuilder.buildFPTrunc(DstTy, Mid.getReg(0));
+ break;
+ }
+ }
+
+ MRI.replaceRegWith(Dst, Fin.getReg(0));
+ MI.eraseFromParent();
+ return true;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index bcb294326fa92..049808d66f983 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -67,6 +67,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizePrefetch(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeBitcast(MachineInstr &MI, LegalizerHelper &Helper) const;
+ bool legalizeViaF32(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, unsigned Opcode) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 23dcaea2ac1a4..e675fac0f13ac 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -901,6 +901,200 @@ unsigned getCmpOperandFoldingProfit(Register CmpOp, MachineRegisterInfo &MRI) {
return 0;
}
+// Helper function for matchFpTruncFpTrunc.
+// Checks that the given definition is an FPTRUNC with a 64-bit source that was
+// not converted from an integer, in which case no extra rounding is necessary
+bool checkTruncSrc(MachineRegisterInfo &MRI, MachineInstr *MaybeFpTrunc) {
+ if (!MaybeFpTrunc || MaybeFpTrunc->getOpcode() != TargetOpcode::G_FPTRUNC)
+ return false;
+
+ // Check the source is 64 bits as we only want to match a very specific
+ // pattern
+ Register FpTruncSrc = MaybeFpTrunc->getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(FpTruncSrc);
+ if (SrcTy.getScalarSizeInBits() != 64)
+ return false;
+
+ // Need to check that the float didn't come from an int, as no rounding is
+ // necessary
+ MachineInstr *FpTruncSrcDef = getDefIgnoringCopies(FpTruncSrc, MRI);
+ if (FpTruncSrcDef->getOpcode() == TargetOpcode::G_SITOFP ||
+ FpTruncSrcDef->getOpcode() == TargetOpcode::G_UITOFP)
+ return false;
+
+ return true;
+}
+
+// To avoid double rounding issues we need to lower FPTRUNC(FPTRUNC) to a
+// round-to-odd truncate followed by a normal truncate. When
+// truncating a float that came from an integer this is not a problem, as the
+// integer's smaller value range makes a single rounding step exact
+bool matchFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ if (MI.getOpcode() != TargetOpcode::G_FPTRUNC)
+ return false;
+
+ // Check the destination is 16 bits as we only want to match a very specific
+ // pattern
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ if (DstTy.getScalarSizeInBits() != 16)
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+
+ MachineInstr *ParentDef = getDefIgnoringCopies(Src, MRI);
+ if (!ParentDef)
+ return false;
+
+ MachineInstr *FpTruncDef;
+ switch (ParentDef->getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_CONCAT_VECTORS: {
+ // Expecting exactly two FPTRUNCs
+ if (ParentDef->getNumOperands() != 3)
+ return false;
+
+ // All operands need to be FPTRUNC
+ for (unsigned OpIdx = 1, NumOperands = ParentDef->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ Register FpTruncDst = ParentDef->getOperand(OpIdx).getReg();
+
+ FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
+
+ if (!checkTruncSrc(MRI, FpTruncDef))
+ return false;
+ }
+
+ return true;
+ }
+ // This is to match cases in which vectors are widened to a larger size
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ Register VecExtractDst = ParentDef->getOperand(2).getReg();
+ MachineInstr *VecExtractDef = getDefIgnoringCopies(VecExtractDst, MRI);
+
+ Register FpTruncDst = VecExtractDef->getOperand(1).getReg();
+ FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
+
+ if (!checkTruncSrc(MRI, FpTruncDef))
+ return false;
+ break;
+ }
+ case TargetOpcode::G_FPTRUNC: {
+ Register FpTruncDst = ParentDef->getOperand(1).getReg();
+ FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
+
+ if (!checkTruncSrc(MRI, FpTruncDef))
+ return false;
+ break;
+ }
+ }
+
+ return true;
+}
+
+void applyFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT V2F32 = LLT::fixed_vector(2, LLT::scalar(32));
+ LLT V4F32 = LLT::fixed_vector(4, LLT::scalar(32));
+ LLT V4F16 = LLT::fixed_vector(4, LLT::scalar(16));
+
+ B.setInstrAndDebugLoc(MI);
+
+ MachineInstr *ParentDef = getDefIgnoringCopies(Src, MRI);
+ if (!ParentDef)
+ return;
+
+ switch (ParentDef->getOpcode()) {
+ default:
+ return;
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ Register VecExtractDst = ParentDef->getOperand(2).getReg();
+ MachineInstr *VecExtractDef = getDefIgnoringCopies(VecExtractDst, MRI);
+
+ Register FpTruncDst = VecExtractDef->getOperand(1).getReg();
+ MachineInstr *FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
+
+ Register FpTruncSrc = FpTruncDef->getOperand(1).getReg();
+ MRI.setRegClass(FpTruncSrc, &AArch64::FPR128RegClass);
+
+ Register Fp32 = MRI.createGenericVirtualRegister(V2F32);
+ MRI.setRegClass(Fp32, &AArch64::FPR64RegClass);
+
+ B.buildInstr(AArch64::FCVTXNv2f32, {Fp32}, {FpTruncSrc});
+
+ // Only 4f32 -> 4f16 is legal so we need to mimic that situation
+ Register Fp32Padding = B.buildUndef(V2F32).getReg(0);
+ MRI.setRegClass(Fp32Padding, &AArch64::FPR64RegClass);
+
+ Register Fp32Full = MRI.createGenericVirtualRegister(V4F32);
+ MRI.setRegClass(Fp32Full, &AArch64::FPR128RegClass);
+ B.buildConcatVectors(Fp32Full, {Fp32, Fp32Padding});
+
+ Register Fp16 = MRI.createGenericVirtualRegister(V4F16);
+ MRI.setRegClass(Fp16, &AArch64::FPR64RegClass);
+ B.buildFPTrunc(Fp16, Fp32Full);
+
+ MRI.replaceRegWith(Dst, Fp16);
+ MI.eraseFromParent();
+ break;
+ }
+ case TargetOpcode::G_CONCAT_VECTORS: {
+ // Get the two FP Truncs that are being concatenated
+ Register FpTrunc1Dst = ParentDef->getOperand(1).getReg();
+ Register FpTrunc2Dst = ParentDef->getOperand(2).getReg();
+
+ MachineInstr *FpTrunc1Def = getDefIgnoringCopies(FpTrunc1Dst, MRI);
+ MachineInstr *FpTrunc2Def = getDefIgnoringCopies(FpTrunc2Dst, MRI);
+
+ // Make the registers 128-bit to store the 2 doubles
+ Register LoFp64 = FpTrunc1Def->getOperand(1).getReg();
+ MRI.setRegClass(LoFp64, &AArch64::FPR128RegClass);
+ Register HiFp64 = FpTrunc2Def->getOperand(1).getReg();
+ MRI.setRegClass(HiFp64, &AArch64::FPR128RegClass);
+
+ B.setInstrAndDebugLoc(MI);
+
+ // Convert the lower half
+ Register LoFp32 = MRI.createGenericVirtualRegister(V2F32);
+ MRI.setRegClass(LoFp32, &AArch64::FPR64RegClass);
+ B.buildInstr(AArch64::FCVTXNv2f32, {LoFp32}, {LoFp64});
+
+ // Create a register for the high half to use
+ Register AccUndef = MRI.createGenericVirtualRegister(V4F32);
+ MRI.setRegClass(AccUndef, &AArch64::FPR128RegClass);
+ B.buildUndef(AccUndef);
+
+ Register Acc = MRI.createGenericVirtualRegister(V4F32);
+ MRI.setRegClass(Acc, &AArch64::FPR128RegClass);
+ B.buildInstr(TargetOpcode::INSERT_SUBREG)
+ .addDef(Acc)
+ .addUse(AccUndef)
+ .addUse(LoFp32)
+ .addImm(AArch64::dsub);
+
+ // Convert the high half
+ Register AccOut = MRI.createGenericVirtualRegister(V4F32);
+ MRI.setRegClass(AccOut, &AArch64::FPR128RegClass);
+ B.buildInstr(AArch64::FCVTXNv4f32)
+ .addDef(AccOut)
+ .addUse(Acc)
+ .addUse(HiFp64);
+
+ Register Fp16 = MRI.createGenericVirtualRegister(V4F16);
+ MRI.setRegClass(Fp16, &AArch64::FPR64RegClass);
+ B.buildFPTrunc(Fp16, AccOut);
+
+ MRI.replaceRegWith(Dst, Fp16);
+ MI.eraseFromParent();
+ break;
+ }
+ }
+}
+
/// \returns true if it would be profitable to swap the LHS and RHS of a G_ICMP
/// instruction \p MI.
bool trySwapICmpOperands(MachineInstr &MI, MachineRegisterInfo &MRI) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 896603d6eb20d..0561f91b6e015 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -555,11 +555,11 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 2, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 2, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTOSI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index 3e4b887fed55d..b8b8d20b9a17b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1197,30 +1197,22 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) {
;
; CHECK-GI-LABEL: vec_round_f16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sub sp, sp, #64
-; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #48
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: mov v2.d[1], x8
-; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: bl __trunctfhf2
+; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: fmov d0, d1
+; CHECK-GI-NEXT: add sp, sp, #48
; CHECK-GI-NEXT: ret
%dst = fptrunc <2 x fp128> %val to <2 x half>
ret <2 x half> %dst
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index a37aabb0b5384..12b6562b5cf0c 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -865,22 +865,22 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v4.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v4.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
@@ -1350,22 +1350,22 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT: mov v5.h[0], v2.h[4]
; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v2.h[5]
-; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT: mov v1.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fmul v3.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvtn v4.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v4.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[2]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v4.h[1]
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v4.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v3.h[3]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[3]
; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v1.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[2]
diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
index 6233ce743b706..760742a4efad7 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -170,47 +170,12 @@ define <4 x half> @s_to_h(<4 x float> %a) {
}
define <4 x half> @d_to_h(<4 x double> %a) {
-; CHECK-CVT-SD-LABEL: d_to_h:
-; CHECK-CVT-SD: // %bb.0:
-; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-SD-NEXT: ret
-;
-; CHECK-FP16-SD-LABEL: d_to_h:
-; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-FP16-SD-NEXT: ret
-;
-; CHECK-CVT-GI-LABEL: d_to_h:
-; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov d2, v0.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h0, d0
-; CHECK-CVT-GI-NEXT: mov d3, v1.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: fcvt h2, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h2, d3
-; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-CVT-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-CVT-GI-NEXT: ret
-;
-; CHECK-FP16-GI-LABEL: d_to_h:
-; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: mov d2, v0.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h0, d0
-; CHECK-FP16-GI-NEXT: mov d3, v1.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: fcvt h2, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h2, d3
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-FP16-GI-NEXT: ret
+; CHECK-LABEL: d_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %1
}
@@ -241,30 +206,16 @@ define <4 x double> @h_to_d(<4 x half> %a) {
;
; CHECK-CVT-GI-LABEL: h_to_d:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-CVT-GI-NEXT: mov h1, v0.h[1]
-; CHECK-CVT-GI-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-GI-NEXT: mov h3, v0.h[3]
-; CHECK-CVT-GI-NEXT: fcvt d0, h0
-; CHECK-CVT-GI-NEXT: fcvt d4, h1
-; CHECK-CVT-GI-NEXT: fcvt d1, h2
-; CHECK-CVT-GI-NEXT: fcvt d2, h3
-; CHECK-CVT-GI-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-CVT-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.2d, v1.4s
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: h_to_d:
; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d4, h1
-; CHECK-FP16-GI-NEXT: fcvt d1, h2
-; CHECK-FP16-GI-NEXT: fcvt d2, h3
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-FP16-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s
; CHECK-FP16-GI-NEXT: ret
%1 = fpext <4 x half> %a to <4 x double>
ret <4 x double> %1
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 86763eb5f9e3b..4d8505679c71c 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -198,48 +198,22 @@ define <8 x half> @d_to_h(<8 x double> %a) {
;
; CHECK-CVT-GI-LABEL: d_to_h:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov d4, v0.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h0, d0
-; CHECK-CVT-GI-NEXT: mov d5, v1.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: fcvt h4, d4
-; CHECK-CVT-GI-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h4, d5
-; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-CVT-GI-NEXT: mov d1, v2.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h2, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-CVT-GI-NEXT: mov d2, v3.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h3, d3
-; CHECK-CVT-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h1, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-CVT-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-CVT-GI-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-GI-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-GI-NEXT: fcvtxn v1.2s, v2.2d
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtxn2 v1.4s, v3.2d
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: d_to_h:
; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: mov d4, v0.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h0, d0
-; CHECK-FP16-GI-NEXT: mov d5, v1.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: fcvt h4, d4
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h4, d5
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov d1, v2.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h2, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-FP16-GI-NEXT: mov d2, v3.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h3, d3
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h1, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-FP16-GI-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-FP16-GI-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtxn v1.2s, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtxn2 v1.4s, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-FP16-GI-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
@@ -298,48 +272,22 @@ define <8 x double> @h_to_d(<8 x half> %a) {
;
; CHECK-CVT-GI-LABEL: h_to_d:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov h1, v0.h[1]
-; CHECK-CVT-GI-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-GI-NEXT: mov h3, v0.h[3]
-; CHECK-CVT-GI-NEXT: mov h4, v0.h[4]
-; CHECK-CVT-GI-NEXT: mov h5, v0.h[5]
-; CHECK-CVT-GI-NEXT: mov h6, v0.h[6]
-; CHECK-CVT-GI-NEXT: mov h7, v0.h[7]
-; CHECK-CVT-GI-NEXT: fcvt d0, h0
-; CHECK-CVT-GI-NEXT: fcvt d16, h1
-; CHECK-CVT-GI-NEXT: fcvt d1, h2
-; CHECK-CVT-GI-NEXT: fcvt d17, h3
-; CHECK-CVT-GI-NEXT: fcvt d2, h4
-; CHECK-CVT-GI-NEXT: fcvt d4, h5
-; CHECK-CVT-GI-NEXT: fcvt d3, h6
-; CHECK-CVT-GI-NEXT: fcvt d5, h7
-; CHECK-CVT-GI-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-CVT-GI-NEXT: mov v1.d[1], v17.d[0]
-; CHECK-CVT-GI-NEXT: mov v2.d[1], v4.d[0]
-; CHECK-CVT-GI-NEXT: mov v3.d[1], v5.d[0]
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtl v2.2d, v3.2s
+; CHECK-CVT-GI-NEXT: fcvtl2 v3.2d, v3.4s
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: h_to_d:
; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-FP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-FP16-GI-NEXT: mov h6, v0.h[6]
-; CHECK-FP16-GI-NEXT: mov h7, v0.h[7]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d16, h1
-; CHECK-FP16-GI-NEXT: fcvt d1, h2
-; CHECK-FP16-GI-NEXT: fcvt d17, h3
-; CHECK-FP16-GI-NEXT: fcvt d2, h4
-; CHECK-FP16-GI-NEXT: fcvt d4, h5
-; CHECK-FP16-GI-NEXT: fcvt d3, h6
-; CHECK-FP16-GI-NEXT: fcvt d5, h7
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-FP16-GI-NEXT: mov v1.d[1], v17.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v4.d[0]
-; CHECK-FP16-GI-NEXT: mov v3.d[1], v5.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v3.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v3.2d, v3.4s
; CHECK-FP16-GI-NEXT: ret
%1 = fpext <8 x half> %a to <8 x double>
ret <8 x double> %1
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 637c02875b84e..b075a8b6f70ee 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -285,31 +285,24 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: stest_f16i32:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: adrp x8, .LCPI6_1
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI6_1]
; CHECK-FP16-GI-NEXT: adrp x8, .LCPI6_0
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v1.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v1.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b
; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI6_0]
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v0.2d, v2.2d
+; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
@@ -351,24 +344,17 @@ define <4 x i32> @utest_f16i32(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: utest_f16i32:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d
; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v2.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptoui <4 x half> %x to <4 x i64>
@@ -412,28 +398,21 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: ustest_f16i32:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: cmgt v2.2d, v0.2d, #0
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, #0
-; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-FP16-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-FP16-GI-NEXT: cmgt v1.2d, v2.2d, #0
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, #0
+; CHECK-FP16-GI-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
@@ -2273,31 +2252,24 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: stest_f16i32_mm:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: adrp x8, .LCPI33_1
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_1]
; CHECK-FP16-GI-NEXT: adrp x8, .LCPI33_0
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v1.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v2.2d, v1.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v2.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b
; CHECK-FP16-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, v2.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v0.2d, v2.2d
+; CHECK-FP16-GI-NEXT: bif v1.16b, v2.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
@@ -2337,24 +2309,17 @@ define <4 x i32> @utest_f16i32_mm(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: utest_f16i32_mm:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d
; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmhi v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmhi v4.2d, v1.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v2.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptoui <4 x half> %x to <4 x i64>
@@ -2397,28 +2362,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
;
; CHECK-FP16-GI-LABEL: ustest_f16i32_mm:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
; CHECK-FP16-GI-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v0.2d
-; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: bit v1.16b, v2.16b, v4.16b
-; CHECK-FP16-GI-NEXT: cmgt v2.2d, v0.2d, #0
-; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, #0
-; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-FP16-GI-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: cmgt v4.2d, v1.2d, v0.2d
+; CHECK-FP16-GI-NEXT: bif v2.16b, v1.16b, v3.16b
+; CHECK-FP16-GI-NEXT: bif v0.16b, v1.16b, v4.16b
+; CHECK-FP16-GI-NEXT: cmgt v1.2d, v2.2d, #0
+; CHECK-FP16-GI-NEXT: cmgt v3.2d, v0.2d, #0
+; CHECK-FP16-GI-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-FP16-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-FP16-GI-NEXT: uzp1 v0.4s, v1.4s, v0.4s
; CHECK-FP16-GI-NEXT: ret
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index df90f9d5f0910..8980340a447de 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -82,11 +82,12 @@ define <3 x double> @fpext_v3f32_v3f64(<3 x float> %a) {
;
; CHECK-GI-LABEL: fpext_v3f32_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[2]
; CHECK-GI-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-GI-NEXT: fcvt d2, s1
+; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x float> %a to <3 x double>
@@ -320,20 +321,11 @@ entry:
}
define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) {
-; CHECK-SD-LABEL: fpext_v2f16_v2f64:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fpext_v2f16_v2f64:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: fcvt d0, h0
-; CHECK-GI-NEXT: fcvt d1, h1
-; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fpext_v2f16_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: ret
entry:
%c = fpext <2 x half> %a to <2 x double>
ret <2 x double> %c
@@ -353,12 +345,12 @@ define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
;
; CHECK-GI-LABEL: fpext_v3f16_v3f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fcvt d0, h0
-; CHECK-GI-NEXT: fcvt d1, h1
-; CHECK-GI-NEXT: fcvt d2, h2
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-GI-NEXT: fcvtl2 v2.2d, v1.4s
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x half> %a to <3 x double>
@@ -375,16 +367,9 @@ define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) {
;
; CHECK-GI-LABEL: fpext_v4f16_v4f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fcvt d0, h0
-; CHECK-GI-NEXT: fcvt d4, h1
-; CHECK-GI-NEXT: fcvt d1, h2
-; CHECK-GI-NEXT: fcvt d2, h3
-; CHECK-GI-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s
; CHECK-GI-NEXT: ret
entry:
%c = fpext <4 x half> %a to <4 x double>
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index f6053cee50dae..3dafabe0b69d7 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -4610,11 +4610,8 @@ define <2 x i64> @fptos_v2f16_v2i64(<2 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptos_v2f16_v2i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v0.2s
; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-FP16-GI-NEXT: ret
entry:
@@ -4654,11 +4651,8 @@ define <2 x i64> @fptou_v2f16_v2i64(<2 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptou_v2f16_v2i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v0.2d, v0.2s
; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-FP16-GI-NEXT: ret
entry:
@@ -4710,20 +4704,14 @@ define <3 x i64> @fptos_v3f16_v3i64(<3 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptos_v3f16_v3i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: fcvt d1, h0
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: fcvt d2, h3
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov d1, v0.d[1]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v1.2d
; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-FP16-GI-NEXT: mov d1, v0.d[1]
+; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptosi <3 x half> %a to <3 x i64>
@@ -4774,20 +4762,14 @@ define <3 x i64> @fptou_v3f16_v3i64(<3 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptou_v3f16_v3i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT: fcvt d1, h0
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-FP16-GI-NEXT: fcvt d2, h3
-; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov d1, v0.d[1]
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v1.2d
; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-FP16-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-FP16-GI-NEXT: mov d1, v0.d[1]
+; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptoui <3 x half> %a to <3 x i64>
@@ -4842,17 +4824,10 @@ define <4 x i64> @fptos_v4f16_v4i64(<4 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptos_v4f16_v4i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v1.2d
; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-FP16-GI-NEXT: ret
entry:
@@ -4908,17 +4883,10 @@ define <4 x i64> @fptou_v4f16_v4i64(<4 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptou_v4f16_v4i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v1.2d
; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-FP16-GI-NEXT: ret
entry:
@@ -5005,29 +4973,16 @@ define <8 x i64> @fptos_v8f16_v8i64(<8 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptos_v8f16_v8i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-FP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-FP16-GI-NEXT: mov h6, v0.h[6]
-; CHECK-FP16-GI-NEXT: mov h7, v0.h[7]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: fcvt d5, h5
-; CHECK-FP16-GI-NEXT: fcvt d6, h6
-; CHECK-FP16-GI-NEXT: fcvt d7, h7
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v4.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v6.2d
+; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-FP16-GI-NEXT: fcvtl v3.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v4.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v4.2d
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptosi <8 x half> %a to <8 x i64>
@@ -5113,29 +5068,16 @@ define <8 x i64> @fptou_v8f16_v8i64(<8 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptou_v8f16_v8i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[4]
-; CHECK-FP16-GI-NEXT: mov h5, v0.h[5]
-; CHECK-FP16-GI-NEXT: mov h6, v0.h[6]
-; CHECK-FP16-GI-NEXT: mov h7, v0.h[7]
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d1, h1
-; CHECK-FP16-GI-NEXT: fcvt d2, h2
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: fcvt d5, h5
-; CHECK-FP16-GI-NEXT: fcvt d6, h6
-; CHECK-FP16-GI-NEXT: fcvt d7, h7
-; CHECK-FP16-GI-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v4.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v6.2d
+; CHECK-FP16-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-FP16-GI-NEXT: fcvtl v2.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-FP16-GI-NEXT: fcvtl v3.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v4.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v4.2d
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptoui <8 x half> %a to <8 x i64>
@@ -5285,52 +5227,26 @@ define <16 x i64> @fptos_v16f16_v16i64(<16 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptos_v16f16_v16i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d2, h0
-; CHECK-FP16-GI-NEXT: mov h6, v0.h[4]
-; CHECK-FP16-GI-NEXT: mov h7, v0.h[5]
-; CHECK-FP16-GI-NEXT: mov h16, v0.h[6]
-; CHECK-FP16-GI-NEXT: mov h0, v0.h[7]
-; CHECK-FP16-GI-NEXT: mov h17, v1.h[1]
-; CHECK-FP16-GI-NEXT: mov h18, v1.h[2]
-; CHECK-FP16-GI-NEXT: mov h19, v1.h[3]
-; CHECK-FP16-GI-NEXT: mov h20, v1.h[4]
-; CHECK-FP16-GI-NEXT: mov h21, v1.h[5]
-; CHECK-FP16-GI-NEXT: mov h22, v1.h[6]
-; CHECK-FP16-GI-NEXT: mov h23, v1.h[7]
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: fcvt d5, h5
-; CHECK-FP16-GI-NEXT: fcvt d6, h6
-; CHECK-FP16-GI-NEXT: fcvt d7, h7
-; CHECK-FP16-GI-NEXT: fcvt d16, h16
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d24, h1
-; CHECK-FP16-GI-NEXT: fcvt d1, h17
-; CHECK-FP16-GI-NEXT: fcvt d17, h18
-; CHECK-FP16-GI-NEXT: fcvt d18, h19
-; CHECK-FP16-GI-NEXT: fcvt d19, h20
-; CHECK-FP16-GI-NEXT: fcvt d20, h21
-; CHECK-FP16-GI-NEXT: fcvt d21, h22
-; CHECK-FP16-GI-NEXT: fcvt d22, h23
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-FP16-GI-NEXT: mov v16.d[1], v0.d[0]
-; CHECK-FP16-GI-NEXT: mov v24.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v17.d[1], v18.d[0]
-; CHECK-FP16-GI-NEXT: mov v19.d[1], v20.d[0]
-; CHECK-FP16-GI-NEXT: mov v21.d[1], v22.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v2.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v4.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v6.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v16.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v4.2d, v24.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v5.2d, v17.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v6.2d, v19.2d
-; CHECK-FP16-GI-NEXT: fcvtzs v7.2d, v21.2d
+; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-FP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-FP16-GI-NEXT: fcvtl v4.2d, v2.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v2.4s
+; CHECK-FP16-GI-NEXT: fcvtl v5.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v6.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtl v7.2d, v3.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v16.2d, v3.4s
+; CHECK-FP16-GI-NEXT: fcvtl v17.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v18.2d, v1.4s
+; CHECK-FP16-GI-NEXT: fcvtzs v0.2d, v4.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v2.2d, v5.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v3.2d, v6.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v4.2d, v7.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v5.2d, v16.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v6.2d, v17.2d
+; CHECK-FP16-GI-NEXT: fcvtzs v7.2d, v18.2d
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptosi <16 x half> %a to <16 x i64>
@@ -5480,52 +5396,26 @@ define <16 x i64> @fptou_v16f16_v16i64(<16 x half> %a) {
;
; CHECK-FP16-GI-LABEL: fptou_v16f16_v16i64:
; CHECK-FP16-GI: // %bb.0: // %entry
-; CHECK-FP16-GI-NEXT: mov h3, v0.h[1]
-; CHECK-FP16-GI-NEXT: mov h4, v0.h[2]
-; CHECK-FP16-GI-NEXT: mov h5, v0.h[3]
-; CHECK-FP16-GI-NEXT: fcvt d2, h0
-; CHECK-FP16-GI-NEXT: mov h6, v0.h[4]
-; CHECK-FP16-GI-NEXT: mov h7, v0.h[5]
-; CHECK-FP16-GI-NEXT: mov h16, v0.h[6]
-; CHECK-FP16-GI-NEXT: mov h0, v0.h[7]
-; CHECK-FP16-GI-NEXT: mov h17, v1.h[1]
-; CHECK-FP16-GI-NEXT: mov h18, v1.h[2]
-; CHECK-FP16-GI-NEXT: mov h19, v1.h[3]
-; CHECK-FP16-GI-NEXT: mov h20, v1.h[4]
-; CHECK-FP16-GI-NEXT: mov h21, v1.h[5]
-; CHECK-FP16-GI-NEXT: mov h22, v1.h[6]
-; CHECK-FP16-GI-NEXT: mov h23, v1.h[7]
-; CHECK-FP16-GI-NEXT: fcvt d3, h3
-; CHECK-FP16-GI-NEXT: fcvt d4, h4
-; CHECK-FP16-GI-NEXT: fcvt d5, h5
-; CHECK-FP16-GI-NEXT: fcvt d6, h6
-; CHECK-FP16-GI-NEXT: fcvt d7, h7
-; CHECK-FP16-GI-NEXT: fcvt d16, h16
-; CHECK-FP16-GI-NEXT: fcvt d0, h0
-; CHECK-FP16-GI-NEXT: fcvt d24, h1
-; CHECK-FP16-GI-NEXT: fcvt d1, h17
-; CHECK-FP16-GI-NEXT: fcvt d17, h18
-; CHECK-FP16-GI-NEXT: fcvt d18, h19
-; CHECK-FP16-GI-NEXT: fcvt d19, h20
-; CHECK-FP16-GI-NEXT: fcvt d20, h21
-; CHECK-FP16-GI-NEXT: fcvt d21, h22
-; CHECK-FP16-GI-NEXT: fcvt d22, h23
-; CHECK-FP16-GI-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-FP16-GI-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-FP16-GI-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-FP16-GI-NEXT: mov v16.d[1], v0.d[0]
-; CHECK-FP16-GI-NEXT: mov v24.d[1], v1.d[0]
-; CHECK-FP16-GI-NEXT: mov v17.d[1], v18.d[0]
-; CHECK-FP16-GI-NEXT: mov v19.d[1], v20.d[0]
-; CHECK-FP16-GI-NEXT: mov v21.d[1], v22.d[0]
-; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v2.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v4.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v6.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v16.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v4.2d, v24.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v5.2d, v17.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v6.2d, v19.2d
-; CHECK-FP16-GI-NEXT: fcvtzu v7.2d, v21.2d
+; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-FP16-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-FP16-GI-NEXT: fcvtl v4.2d, v2.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v2.2d, v2.4s
+; CHECK-FP16-GI-NEXT: fcvtl v5.2d, v0.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v6.2d, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtl v7.2d, v3.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v16.2d, v3.4s
+; CHECK-FP16-GI-NEXT: fcvtl v17.2d, v1.2s
+; CHECK-FP16-GI-NEXT: fcvtl2 v18.2d, v1.4s
+; CHECK-FP16-GI-NEXT: fcvtzu v0.2d, v4.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v1.2d, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v2.2d, v5.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v3.2d, v6.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v4.2d, v7.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v5.2d, v16.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v6.2d, v17.2d
+; CHECK-FP16-GI-NEXT: fcvtzu v7.2d, v18.2d
; CHECK-FP16-GI-NEXT: ret
entry:
%c = fptoui <16 x half> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c74112937ba53..f1b13f93fb7d8 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -3083,30 +3083,14 @@ define <4 x i64> @test_signed_v4f16_v4i64(<4 x half> %f) {
; CHECK-SD-FP16-NEXT: mov v1.d[1], x11
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-CVT-LABEL: test_signed_v4f16_v4i64:
-; CHECK-GI-CVT: // %bb.0:
-; CHECK-GI-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-CVT-NEXT: fcvtl v1.2d, v0.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v2.2d, v0.4s
-; CHECK-GI-CVT-NEXT: fcvtzs v0.2d, v1.2d
-; CHECK-GI-CVT-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-CVT-NEXT: ret
-;
-; CHECK-GI-FP16-LABEL: test_signed_v4f16_v4i64:
-; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: fcvt d0, h0
-; CHECK-GI-FP16-NEXT: fcvt d1, h1
-; CHECK-GI-FP16-NEXT: fcvt d2, h2
-; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-FP16-NEXT: ret
+; CHECK-GI-LABEL: test_signed_v4f16_v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-GI-NEXT: fcvtzs v0.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d
+; CHECK-GI-NEXT: ret
%x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> %f)
ret <4 x i64> %x
}
@@ -3792,46 +3776,19 @@ define <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
; CHECK-SD-FP16-NEXT: mov v3.d[1], x14
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-CVT-LABEL: test_signed_v8f16_v8i64:
-; CHECK-GI-CVT: // %bb.0:
-; CHECK-GI-CVT-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-GI-CVT-NEXT: fcvtl v2.2d, v1.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v1.2d, v1.4s
-; CHECK-GI-CVT-NEXT: fcvtl v3.2d, v0.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v4.2d, v0.4s
-; CHECK-GI-CVT-NEXT: fcvtzs v0.2d, v2.2d
-; CHECK-GI-CVT-NEXT: fcvtzs v1.2d, v1.2d
-; CHECK-GI-CVT-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-GI-CVT-NEXT: fcvtzs v3.2d, v4.2d
-; CHECK-GI-CVT-NEXT: ret
-;
-; CHECK-GI-FP16-LABEL: test_signed_v8f16_v8i64:
-; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-FP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v0.h[6]
-; CHECK-GI-FP16-NEXT: mov h7, v0.h[7]
-; CHECK-GI-FP16-NEXT: fcvt d0, h0
-; CHECK-GI-FP16-NEXT: fcvt d1, h1
-; CHECK-GI-FP16-NEXT: fcvt d2, h2
-; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d4, h4
-; CHECK-GI-FP16-NEXT: fcvt d5, h5
-; CHECK-GI-FP16-NEXT: fcvt d6, h6
-; CHECK-GI-FP16-NEXT: fcvt d7, h7
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-FP16-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d
-; CHECK-GI-FP16-NEXT: fcvtzs v2.2d, v4.2d
-; CHECK-GI-FP16-NEXT: fcvtzs v3.2d, v6.2d
-; CHECK-GI-FP16-NEXT: ret
+; CHECK-GI-LABEL: test_signed_v8f16_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s
+; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-GI-NEXT: fcvtl v3.2d, v0.2s
+; CHECK-GI-NEXT: fcvtl2 v4.2d, v0.4s
+; CHECK-GI-NEXT: fcvtzs v0.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzs v2.2d, v3.2d
+; CHECK-GI-NEXT: fcvtzs v3.2d, v4.2d
+; CHECK-GI-NEXT: ret
%x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f)
ret <8 x i64> %x
}
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index efe0a1bedbc9e..b407b3c0e9940 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -2501,30 +2501,14 @@ define <4 x i64> @test_unsigned_v4f16_v4i64(<4 x half> %f) {
; CHECK-SD-FP16-NEXT: mov v1.d[1], x11
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-CVT-LABEL: test_unsigned_v4f16_v4i64:
-; CHECK-GI-CVT: // %bb.0:
-; CHECK-GI-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-GI-CVT-NEXT: fcvtl v1.2d, v0.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v2.2d, v0.4s
-; CHECK-GI-CVT-NEXT: fcvtzu v0.2d, v1.2d
-; CHECK-GI-CVT-NEXT: fcvtzu v1.2d, v2.2d
-; CHECK-GI-CVT-NEXT: ret
-;
-; CHECK-GI-FP16-LABEL: test_unsigned_v4f16_v4i64:
-; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: fcvt d0, h0
-; CHECK-GI-FP16-NEXT: fcvt d1, h1
-; CHECK-GI-FP16-NEXT: fcvt d2, h2
-; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d
-; CHECK-GI-FP16-NEXT: ret
+; CHECK-GI-LABEL: test_unsigned_v4f16_v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl v1.2d, v0.2s
+; CHECK-GI-NEXT: fcvtl2 v2.2d, v0.4s
+; CHECK-GI-NEXT: fcvtzu v0.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzu v1.2d, v2.2d
+; CHECK-GI-NEXT: ret
%x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f)
ret <4 x i64> %x
}
@@ -3109,46 +3093,19 @@ define <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
; CHECK-SD-FP16-NEXT: mov v3.d[1], x14
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-CVT-LABEL: test_unsigned_v8f16_v8i64:
-; CHECK-GI-CVT: // %bb.0:
-; CHECK-GI-CVT-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-GI-CVT-NEXT: fcvtl v2.2d, v1.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v1.2d, v1.4s
-; CHECK-GI-CVT-NEXT: fcvtl v3.2d, v0.2s
-; CHECK-GI-CVT-NEXT: fcvtl2 v4.2d, v0.4s
-; CHECK-GI-CVT-NEXT: fcvtzu v0.2d, v2.2d
-; CHECK-GI-CVT-NEXT: fcvtzu v1.2d, v1.2d
-; CHECK-GI-CVT-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-GI-CVT-NEXT: fcvtzu v3.2d, v4.2d
-; CHECK-GI-CVT-NEXT: ret
-;
-; CHECK-GI-FP16-LABEL: test_unsigned_v8f16_v8i64:
-; CHECK-GI-FP16: // %bb.0:
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[4]
-; CHECK-GI-FP16-NEXT: mov h5, v0.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v0.h[6]
-; CHECK-GI-FP16-NEXT: mov h7, v0.h[7]
-; CHECK-GI-FP16-NEXT: fcvt d0, h0
-; CHECK-GI-FP16-NEXT: fcvt d1, h1
-; CHECK-GI-FP16-NEXT: fcvt d2, h2
-; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d4, h4
-; CHECK-GI-FP16-NEXT: fcvt d5, h5
-; CHECK-GI-FP16-NEXT: fcvt d6, h6
-; CHECK-GI-FP16-NEXT: fcvt d7, h7
-; CHECK-GI-FP16-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-GI-FP16-NEXT: mov v2.d[1], v3.d[0]
-; CHECK-GI-FP16-NEXT: mov v4.d[1], v5.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d
-; CHECK-GI-FP16-NEXT: fcvtzu v2.2d, v4.2d
-; CHECK-GI-FP16-NEXT: fcvtzu v3.2d, v6.2d
-; CHECK-GI-FP16-NEXT: ret
+; CHECK-GI-LABEL: test_unsigned_v8f16_v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NEXT: fcvtl v2.2d, v1.2s
+; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-GI-NEXT: fcvtl v3.2d, v0.2s
+; CHECK-GI-NEXT: fcvtl2 v4.2d, v0.4s
+; CHECK-GI-NEXT: fcvtzu v0.2d, v2.2d
+; CHECK-GI-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-GI-NEXT: fcvtzu v2.2d, v3.2d
+; CHECK-GI-NEXT: fcvtzu v3.2d, v4.2d
+; CHECK-GI-NEXT: ret
%x = call <8 x i64> @llvm.fptoui.sat.v8f16.v8i64(<8 x half> %f)
ret <8 x i64> %x
}
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 1f84c944d7c16..de780bf475138 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -112,30 +112,22 @@ define <2 x half> @fptrunc_v2f128_v2f16(<2 x fp128> %a) {
;
; CHECK-GI-LABEL: fptrunc_v2f128_v2f16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sub sp, sp, #64
-; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #48
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: mov v2.d[1], x8
-; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-GI-NEXT: bl __trunctfhf2
+; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: fmov d0, d1
+; CHECK-GI-NEXT: add sp, sp, #48
; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <2 x fp128> %a to <2 x half>
@@ -260,8 +252,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: fcvt s2, d2
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
@@ -284,61 +277,49 @@ entry:
}
define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
-; CHECK-SD-LABEL: fptrunc_v2f64_v2f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptrunc_v2f64_v2f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: fcvt h0, d0
-; CHECK-GI-NEXT: fcvt h1, d1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptrunc_v2f64_v2f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptrunc <2 x double> %a to <2 x half>
ret <2 x half> %c
}
define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
-; CHECK-LABEL: fptrunc_v3f64_v3f16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt h0, d0
-; CHECK-NEXT: fcvt h1, d1
-; CHECK-NEXT: fcvt h2, d2
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptrunc_v3f64_v3f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvt h0, d0
+; CHECK-SD-NEXT: fcvt h1, d1
+; CHECK-SD-NEXT: fcvt h2, d2
+; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptrunc_v3f64_v3f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtxn2 v0.4s, v2.2d
+; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <3 x double> %a to <3 x half>
ret <3 x half> %c
}
define <4 x half> @fptrunc_v4f64_v4f16(<4 x double> %a) {
-; CHECK-SD-LABEL: fptrunc_v4f64_v4f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptrunc_v4f64_v4f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov d2, v0.d[1]
-; CHECK-GI-NEXT: fcvt h0, d0
-; CHECK-GI-NEXT: mov d3, v1.d[1]
-; CHECK-GI-NEXT: fcvt h1, d1
-; CHECK-GI-NEXT: fcvt h2, d2
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: fcvt h2, d3
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptrunc_v4f64_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %c
>From 8f86ebca0170bcc0dc3b1dd44271f38d596d54e6 Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan.cowan at arm.com>
Date: Fri, 17 Oct 2025 13:04:54 +0000
Subject: [PATCH 2/2] Address review comments
---
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 14 ++++----
.../AArch64/GISel/AArch64LegalizerInfo.h | 4 +--
.../GISel/AArch64PostLegalizerLowering.cpp | 32 +++++++++----------
3 files changed, 23 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index fde86449a76a7..3f3f1be2271c0 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1491,11 +1491,10 @@ bool AArch64LegalizerInfo::legalizeCustom(
case TargetOpcode::G_BITCAST:
return legalizeBitcast(MI, Helper);
case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC:
// In order to vectorise f16 to f64 properly, we need to use f32 as an
// intermediary
- return legalizeViaF32(MI, MIRBuilder, MRI, TargetOpcode::G_FPEXT);
- case TargetOpcode::G_FPTRUNC:
- return legalizeViaF32(MI, MIRBuilder, MRI, TargetOpcode::G_FPTRUNC);
+ return legalizeFpextFptrunc(MI, MIRBuilder, MRI);
}
llvm_unreachable("expected switch to return");
@@ -2421,10 +2420,9 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
return true;
}
-bool AArch64LegalizerInfo::legalizeViaF32(MachineInstr &MI,
- MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI,
- unsigned Opcode) const {
+bool AArch64LegalizerInfo::legalizeFpextFptrunc(
+ MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
@@ -2435,7 +2433,7 @@ bool AArch64LegalizerInfo::legalizeViaF32(MachineInstr &MI,
MachineInstrBuilder Mid;
MachineInstrBuilder Fin;
MIRBuilder.setInstrAndDebugLoc(MI);
- switch (Opcode) {
+ switch (MI.getOpcode()) {
default:
return false;
case TargetOpcode::G_FPEXT: {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 049808d66f983..15999a44174d3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -67,8 +67,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizePrefetch(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeBitcast(MachineInstr &MI, LegalizerHelper &Helper) const;
- bool legalizeViaF32(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, unsigned Opcode) const;
+ bool legalizeFpextFptrunc(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index e675fac0f13ac..2fa1b86a8a9c6 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -904,7 +904,7 @@ unsigned getCmpOperandFoldingProfit(Register CmpOp, MachineRegisterInfo &MRI) {
// Helper function for matchFpTruncFpTrunc.
// Checks that the given definition belongs to an FPTRUNC and that the source is
// not an integer, as no rounding is necessary due to the range of values
-bool checkTruncSrc(MachineRegisterInfo &MRI, MachineInstr *MaybeFpTrunc) {
+bool isFPTruncFromDouble(MachineRegisterInfo &MRI, MachineInstr *MaybeFpTrunc) {
if (!MaybeFpTrunc || MaybeFpTrunc->getOpcode() != TargetOpcode::G_FPTRUNC)
return false;
@@ -930,8 +930,7 @@ bool checkTruncSrc(MachineRegisterInfo &MRI, MachineInstr *MaybeFpTrunc) {
// truncating an FP that came from an integer this is not a problem as the range
// of values is lower in the int
bool matchFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI) {
- if (MI.getOpcode() != TargetOpcode::G_FPTRUNC)
- return false;
+ assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC && "Expected G_FPTRUNC");
// Check the destination is 16 bits as we only want to match a very specific
// pattern
@@ -959,10 +958,9 @@ bool matchFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI) {
for (unsigned OpIdx = 1, NumOperands = ParentDef->getNumOperands();
OpIdx != NumOperands; ++OpIdx) {
Register FpTruncDst = ParentDef->getOperand(OpIdx).getReg();
-
FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
- if (!checkTruncSrc(MRI, FpTruncDef))
+ if (!isFPTruncFromDouble(MRI, FpTruncDef))
return false;
}
@@ -973,41 +971,43 @@ bool matchFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI) {
Register VecExtractDst = ParentDef->getOperand(2).getReg();
MachineInstr *VecExtractDef = getDefIgnoringCopies(VecExtractDst, MRI);
+ if (!VecExtractDef ||
+ VecExtractDef->getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT)
+ return false;
+
Register FpTruncDst = VecExtractDef->getOperand(1).getReg();
FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
-
- if (!checkTruncSrc(MRI, FpTruncDef))
- return false;
break;
}
case TargetOpcode::G_FPTRUNC: {
Register FpTruncDst = ParentDef->getOperand(1).getReg();
FpTruncDef = getDefIgnoringCopies(FpTruncDst, MRI);
-
- if (!checkTruncSrc(MRI, FpTruncDef))
- return false;
break;
}
}
+ if (!isFPTruncFromDouble(MRI, FpTruncDef))
+ return false;
+
return true;
}
void applyFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) {
+ assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC && "Expected G_FPTRUNC");
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
+ MachineInstr *ParentDef = getDefIgnoringCopies(Src, MRI);
+ if (!ParentDef)
+ return;
+
LLT V2F32 = LLT::fixed_vector(2, LLT::scalar(32));
LLT V4F32 = LLT::fixed_vector(4, LLT::scalar(32));
LLT V4F16 = LLT::fixed_vector(4, LLT::scalar(16));
B.setInstrAndDebugLoc(MI);
- MachineInstr *ParentDef = getDefIgnoringCopies(Src, MRI);
- if (!ParentDef)
- return;
-
switch (ParentDef->getOpcode()) {
default:
return;
@@ -1056,8 +1056,6 @@ void applyFpTruncFpTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
Register HiFp64 = FpTrunc2Def->getOperand(1).getReg();
MRI.setRegClass(HiFp64, &AArch64::FPR128RegClass);
- B.setInstrAndDebugLoc(MI);
-
// Convert the lower half
Register LoFp32 = MRI.createGenericVirtualRegister(V2F32);
MRI.setRegClass(LoFp32, &AArch64::FPR64RegClass);
More information about the llvm-commits
mailing list