[llvm] cc9d3f2 - [AArch64][GlobalISel] Improve lowering of vector fp16 fptrunc (#163398)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 00:32:30 PST 2025
Author: Ryan Cowan
Date: 2025-11-28T08:32:25Z
New Revision: cc9d3f29eca70394eb9e8f1d8e7fb08bce60f61f
URL: https://github.com/llvm/llvm-project/commit/cc9d3f29eca70394eb9e8f1d8e7fb08bce60f61f
DIFF: https://github.com/llvm/llvm-project/commit/cc9d3f29eca70394eb9e8f1d8e7fb08bce60f61f.diff
LOG: [AArch64][GlobalISel] Improve lowering of vector fp16 fptrunc (#163398)
This commit improves the lowering of vectors of fp16 when truncating
(extending was improved previously). Truncation has to be handled in a
specific way to avoid double rounding.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64InstrGISel.td
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
llvm/test/CodeGen/AArch64/arm64-fp128.ll
llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
llvm/test/CodeGen/AArch64/fptrunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 52b216c7fe0f0..7d99786830e3d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -149,6 +149,13 @@ def G_VLSHR : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+// Float truncation using round to odd
+def G_FPTRUNC_ODD : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
// Represents an integer to FP conversion on the FPR bank.
def G_SITOF : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -297,6 +304,8 @@ def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
def : GINodeEquiv<G_AARCH64_PREFETCH, AArch64Prefetch>;
+def : GINodeEquiv<G_FPTRUNC_ODD, AArch64fcvtxn_n>;
+
// These are patterns that we only use for GlobalISel via the importer.
def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)),
(vector_extract (v2f32 FPR64:$Rn), (i64 1)))),
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 089b0b2feb231..1025b2502211a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -820,8 +821,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor(
{{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
.libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
- .clampNumElements(0, v4s16, v4s16)
- .clampNumElements(0, v2s32, v2s32)
+ .moreElementsToNextPow2(1)
+ .customIf([](const LegalityQuery &Q) {
+ LLT DstTy = Q.Types[0];
+ LLT SrcTy = Q.Types[1];
+ return SrcTy.isFixedVector() && DstTy.isFixedVector() &&
+ SrcTy.getScalarSizeInBits() == 64 &&
+ DstTy.getScalarSizeInBits() == 16;
+ })
+ // Clamp based on input
+ .clampNumElements(1, v4s32, v4s32)
+ .clampNumElements(1, v2s64, v2s64)
.scalarize(0);
getActionDefinitionsBuilder(G_FPEXT)
@@ -1479,6 +1489,10 @@ bool AArch64LegalizerInfo::legalizeCustom(
return legalizeICMP(MI, MRI, MIRBuilder);
case TargetOpcode::G_BITCAST:
return legalizeBitcast(MI, Helper);
+ case TargetOpcode::G_FPTRUNC:
+    // In order to lower f64 to f16 properly, we need to use f32 as an
+    // intermediary
+ return legalizeFptrunc(MI, MIRBuilder, MRI);
}
llvm_unreachable("expected switch to return");
@@ -2416,3 +2430,80 @@ bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
MI.eraseFromParent();
return true;
}
+
+bool AArch64LegalizerInfo::legalizeFptrunc(MachineInstr &MI,
+ MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) const {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ assert(SrcTy.isFixedVector() && isPowerOf2_32(SrcTy.getNumElements()) &&
+ "Expected a power of 2 elements");
+
+ LLT s16 = LLT::scalar(16);
+ LLT s32 = LLT::scalar(32);
+ LLT s64 = LLT::scalar(64);
+ LLT v2s16 = LLT::fixed_vector(2, s16);
+ LLT v4s16 = LLT::fixed_vector(4, s16);
+ LLT v2s32 = LLT::fixed_vector(2, s32);
+ LLT v4s32 = LLT::fixed_vector(4, s32);
+ LLT v2s64 = LLT::fixed_vector(2, s64);
+
+ SmallVector<Register> RegsToUnmergeTo;
+ SmallVector<Register> TruncOddDstRegs;
+ SmallVector<Register> RegsToMerge;
+
+ unsigned ElemCount = SrcTy.getNumElements();
+
+ // Find the biggest size chunks we can work with
+ int StepSize = ElemCount % 4 ? 2 : 4;
+
+ // If we have a power of 2 greater than 2, we need to first unmerge into
+ // enough pieces
+ if (ElemCount <= 2)
+ RegsToUnmergeTo.push_back(Src);
+ else {
+ for (unsigned i = 0; i < ElemCount / 2; ++i)
+ RegsToUnmergeTo.push_back(MRI.createGenericVirtualRegister(v2s64));
+
+ MIRBuilder.buildUnmerge(RegsToUnmergeTo, Src);
+ }
+
+ // Create all of the round-to-odd instructions and store them
+ for (auto SrcReg : RegsToUnmergeTo) {
+ Register Mid =
+ MIRBuilder.buildInstr(AArch64::G_FPTRUNC_ODD, {v2s32}, {SrcReg})
+ .getReg(0);
+ TruncOddDstRegs.push_back(Mid);
+ }
+
+ // Truncate 4s32 to 4s16 if we can to reduce instruction count, otherwise
+ // truncate 2s32 to 2s16.
+ unsigned Index = 0;
+ for (unsigned LoopIter = 0; LoopIter < ElemCount / StepSize; ++LoopIter) {
+ if (StepSize == 4) {
+ Register ConcatDst =
+ MIRBuilder
+ .buildMergeLikeInstr(
+ {v4s32}, {TruncOddDstRegs[Index++], TruncOddDstRegs[Index++]})
+ .getReg(0);
+
+ RegsToMerge.push_back(
+ MIRBuilder.buildFPTrunc(v4s16, ConcatDst).getReg(0));
+ } else {
+ RegsToMerge.push_back(
+ MIRBuilder.buildFPTrunc(v2s16, TruncOddDstRegs[Index++]).getReg(0));
+ }
+ }
+
+ // If there is only one register, replace the destination
+ if (RegsToMerge.size() == 1) {
+ MRI.replaceRegWith(Dst, RegsToMerge.pop_back_val());
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Merge the rest of the instructions & replace the register
+ Register Fin = MIRBuilder.buildMergeLikeInstr(DstTy, RegsToMerge).getReg(0);
+ MRI.replaceRegWith(Dst, Fin);
+ MI.eraseFromParent();
+ return true;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index bcb294326fa92..12b6a6fa395a8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -67,6 +67,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizePrefetch(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeBitcast(MachineInstr &MI, LegalizerHelper &Helper) const;
+ bool legalizeFptrunc(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 4a73065ac363c..e68278dadc4b8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -578,8 +578,8 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 2, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_FPTOSI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index 8dd5c3ac05109..498dce138febf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1197,30 +1197,22 @@ define <2 x half> @vec_round_f16(<2 x fp128> %val) {
;
; CHECK-GI-LABEL: vec_round_f16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sub sp, sp, #64
-; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #48
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: str q1, [sp] // 16-byte Spill
-; CHECK-GI-NEXT: mov v2.d[1], x8
-; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Spill
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Spill
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Reload
; CHECK-GI-NEXT: bl __trunctfhf2
+; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Reload
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Spill
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Reload
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: fmov d0, d1
+; CHECK-GI-NEXT: add sp, sp, #48
; CHECK-GI-NEXT: ret
%dst = fptrunc <2 x fp128> %val to <2 x half>
ret <2 x half> %dst
diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
index 1e1e25c04b384..760742a4efad7 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -170,47 +170,12 @@ define <4 x half> @s_to_h(<4 x float> %a) {
}
define <4 x half> @d_to_h(<4 x double> %a) {
-; CHECK-CVT-SD-LABEL: d_to_h:
-; CHECK-CVT-SD: // %bb.0:
-; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-SD-NEXT: ret
-;
-; CHECK-FP16-SD-LABEL: d_to_h:
-; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-FP16-SD-NEXT: ret
-;
-; CHECK-CVT-GI-LABEL: d_to_h:
-; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov d2, v0.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h0, d0
-; CHECK-CVT-GI-NEXT: mov d3, v1.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: fcvt h2, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h2, d3
-; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-CVT-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-CVT-GI-NEXT: ret
-;
-; CHECK-FP16-GI-LABEL: d_to_h:
-; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: mov d2, v0.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h0, d0
-; CHECK-FP16-GI-NEXT: mov d3, v1.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: fcvt h2, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h2, d3
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-FP16-GI-NEXT: ret
+; CHECK-LABEL: d_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %1
}
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 7b152bcccf1e5..f94f8b449c59b 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -176,71 +176,15 @@ define <8 x half> @s_to_h(<8 x float> %a) {
}
define <8 x half> @d_to_h(<8 x double> %a) {
-; CHECK-CVT-SD-LABEL: d_to_h:
-; CHECK-CVT-SD: // %bb.0:
-; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-CVT-SD-NEXT: fcvtxn v2.2s, v2.2d
-; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-CVT-SD-NEXT: fcvtxn2 v2.4s, v3.2d
-; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-CVT-SD-NEXT: ret
-;
-; CHECK-FP16-SD-LABEL: d_to_h:
-; CHECK-FP16-SD: // %bb.0:
-; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-FP16-SD-NEXT: fcvtxn v2.2s, v2.2d
-; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-FP16-SD-NEXT: fcvtxn2 v2.4s, v3.2d
-; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-FP16-SD-NEXT: ret
-;
-; CHECK-CVT-GI-LABEL: d_to_h:
-; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov d4, v0.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h0, d0
-; CHECK-CVT-GI-NEXT: mov d5, v1.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: fcvt h4, d4
-; CHECK-CVT-GI-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h4, d5
-; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-CVT-GI-NEXT: mov d1, v2.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h2, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h1, d1
-; CHECK-CVT-GI-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-CVT-GI-NEXT: mov d2, v3.d[1]
-; CHECK-CVT-GI-NEXT: fcvt h3, d3
-; CHECK-CVT-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-CVT-GI-NEXT: fcvt h1, d2
-; CHECK-CVT-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-CVT-GI-NEXT: mov v0.h[7], v1.h[0]
-; CHECK-CVT-GI-NEXT: ret
-;
-; CHECK-FP16-GI-LABEL: d_to_h:
-; CHECK-FP16-GI: // %bb.0:
-; CHECK-FP16-GI-NEXT: mov d4, v0.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h0, d0
-; CHECK-FP16-GI-NEXT: mov d5, v1.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: fcvt h4, d4
-; CHECK-FP16-GI-NEXT: mov v0.h[1], v4.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h4, d5
-; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-FP16-GI-NEXT: mov d1, v2.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h2, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h1, d1
-; CHECK-FP16-GI-NEXT: mov v0.h[4], v2.h[0]
-; CHECK-FP16-GI-NEXT: mov d2, v3.d[1]
-; CHECK-FP16-GI-NEXT: fcvt h3, d3
-; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-FP16-GI-NEXT: fcvt h1, d2
-; CHECK-FP16-GI-NEXT: mov v0.h[6], v3.h[0]
-; CHECK-FP16-GI-NEXT: mov v0.h[7], v1.h[0]
-; CHECK-FP16-GI-NEXT: ret
+; CHECK-LABEL: d_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
}
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index ae86129286ddc..56b20eaac1c80 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -112,30 +112,22 @@ define <2 x half> @fptrunc_v2f128_v2f16(<2 x fp128> %a) {
;
; CHECK-GI-LABEL: fptrunc_v2f128_v2f16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sub sp, sp, #64
-; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: sub sp, sp, #48
+; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 48
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: mov v2.d[0], x8
; CHECK-GI-NEXT: str q1, [sp] // 16-byte Spill
-; CHECK-GI-NEXT: mov v2.d[1], x8
-; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Spill
; CHECK-GI-NEXT: bl __trunctfhf2
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Spill
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Reload
; CHECK-GI-NEXT: bl __trunctfhf2
+; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Reload
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-NEXT: str q0, [sp] // 16-byte Spill
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Reload
-; CHECK-GI-NEXT: bl __trunctfhf2
-; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Reload
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Reload
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT: fmov d0, d1
+; CHECK-GI-NEXT: add sp, sp, #48
; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <2 x fp128> %a to <2 x half>
@@ -260,8 +252,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: fcvt s2, d2
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
@@ -284,61 +277,49 @@ entry:
}
define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
-; CHECK-SD-LABEL: fptrunc_v2f64_v2f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptrunc_v2f64_v2f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: fcvt h0, d0
-; CHECK-GI-NEXT: fcvt h1, d1
-; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptrunc_v2f64_v2f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptrunc <2 x double> %a to <2 x half>
ret <2 x half> %c
}
define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
-; CHECK-LABEL: fptrunc_v3f64_v3f16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt h0, d0
-; CHECK-NEXT: fcvt h1, d1
-; CHECK-NEXT: fcvt h2, d2
-; CHECK-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fptrunc_v3f64_v3f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvt h0, d0
+; CHECK-SD-NEXT: fcvt h1, d1
+; CHECK-SD-NEXT: fcvt h2, d2
+; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-SD-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fptrunc_v3f64_v3f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtxn2 v0.4s, v2.2d
+; CHECK-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT: ret
entry:
%c = fptrunc <3 x double> %a to <3 x half>
ret <3 x half> %c
}
define <4 x half> @fptrunc_v4f64_v4f16(<4 x double> %a) {
-; CHECK-SD-LABEL: fptrunc_v4f64_v4f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptrunc_v4f64_v4f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov d2, v0.d[1]
-; CHECK-GI-NEXT: fcvt h0, d0
-; CHECK-GI-NEXT: mov d3, v1.d[1]
-; CHECK-GI-NEXT: fcvt h1, d1
-; CHECK-GI-NEXT: fcvt h2, d2
-; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT: fcvt h2, d3
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptrunc_v4f64_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %c
More information about the llvm-commits
mailing list