[llvm] 5f935e9 - [AArch64] Optimize fp64 <-> fp16 SIMD conversions
Author: David Majnemer
Date: 2024-03-08T19:52:53Z
New Revision: 5f935e91810eb28854611faf13bb7d07a8dbf470
URL: https://github.com/llvm/llvm-project/commit/5f935e91810eb28854611faf13bb7d07a8dbf470
DIFF: https://github.com/llvm/llvm-project/commit/5f935e91810eb28854611faf13bb7d07a8dbf470.diff
LOG: [AArch64] Optimize fp64 <-> fp16 SIMD conversions
Type legalization would needlessly scalarize fp64 <-> fp16 (and bf16)
vector conversions into per-lane fcvt chains. Add some DAGCombines
over the resulting BUILD_VECTOR patterns to reassemble the vector
forms: FCVTXN followed by FCVTN for truncation, and paired FCVTL
steps for extension.
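
As a quick sketch of the effect (the llc invocation and function names
here are illustrative, not part of the commit; the IR and the expected
instructions mirror the updated fptrunc.ll and fpext.ll tests below):

  ; llc -mtriple=aarch64 -o - example.ll
  define <2 x half> @trunc_v2f64(<2 x double> %v) {
    ; before: per-lane mov/fcvt; after: fcvtxn v0.2s, v0.2d + fcvtn v0.4h, v0.4s
    %r = fptrunc <2 x double> %v to <2 x half>
    ret <2 x half> %r
  }

  define <2 x double> @ext_v2f16(<2 x half> %v) {
    ; before: per-lane mov/fcvt; after: fcvtl v0.4s, v0.4h + fcvtl v0.2d, v0.2s
    %r = fpext <2 x half> %v to <2 x double>
    ret <2 x double> %r
  }

FCVTXN rounds to odd, which preserves the sticky information in the
intermediate f32 value, so the two-step f64 -> f32 -> f16 truncation
still yields the same result as a single correctly rounded conversion.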
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
llvm/test/CodeGen/AArch64/fpext.ll
llvm/test/CodeGen/AArch64/fptrunc.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
llvm/test/CodeGen/AArch64/vector-fcopysign.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2290223a06f8ef..89b697b2d51528 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4507,13 +4507,16 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
};
if (Op.getValueType() == MVT::bf16) {
+ unsigned MaxWidth = IsSigned
+ ? DAG.ComputeMaxSignificantBits(SrcVal)
+ : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
// bf16 conversions are promoted to f32 when converting from i16.
- if (DAG.ComputeMaxSignificantBits(SrcVal) <= 24) {
+ if (MaxWidth <= 24) {
return IntToFpViaPromotion(MVT::f32);
}
// bf16 conversions are promoted to f64 when converting from i32.
- if (DAG.ComputeMaxSignificantBits(SrcVal) <= 53) {
+ if (MaxWidth <= 53) {
return IntToFpViaPromotion(MVT::f64);
}
@@ -19376,6 +19379,94 @@ static SDValue performBuildVectorCombine(SDNode *N,
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
+ SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
+ Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
+ if (Elt0->getOpcode() == ISD::FP_ROUND &&
+ Elt1->getOpcode() == ISD::FP_ROUND &&
+ isa<ConstantSDNode>(Elt0->getOperand(1)) &&
+ isa<ConstantSDNode>(Elt1->getOperand(1)) &&
+ Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
+ Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ // Constant index.
+ isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
+ isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
+ Elt0->getOperand(0)->getOperand(0) ==
+ Elt1->getOperand(0)->getOperand(0) &&
+ Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
+ Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
+ SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
+ if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
+ SDValue HighLanes;
+ if (Elt2->getOpcode() == ISD::UNDEF &&
+ Elt3->getOpcode() == ISD::UNDEF) {
+ HighLanes = DAG.getUNDEF(MVT::v2f32);
+ } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
+ Elt3->getOpcode() == ISD::FP_ROUND &&
+ isa<ConstantSDNode>(Elt2->getOperand(1)) &&
+ isa<ConstantSDNode>(Elt3->getOperand(1)) &&
+ Elt2->getConstantOperandVal(1) ==
+ Elt3->getConstantOperandVal(1) &&
+ Elt2->getOperand(0)->getOpcode() ==
+ ISD::EXTRACT_VECTOR_ELT &&
+ Elt3->getOperand(0)->getOpcode() ==
+ ISD::EXTRACT_VECTOR_ELT &&
+ // Constant index.
+ isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
+ isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
+ Elt2->getOperand(0)->getOperand(0) ==
+ Elt3->getOperand(0)->getOperand(0) &&
+ Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
+ Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
+ SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
+ HighLanes =
+ DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
+ }
+ if (HighLanes) {
+ SDValue DoubleToSingleSticky =
+ DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+ DoubleToSingleSticky, HighLanes);
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
+ Elt0->getOperand(1));
+ }
+ }
+ }
+ }
+
+ if (VT == MVT::v2f64) {
+ SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
+ if (Elt0->getOpcode() == ISD::FP_EXTEND &&
+ Elt1->getOpcode() == ISD::FP_EXTEND &&
+ Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Elt0->getOperand(0)->getOperand(0) ==
+ Elt1->getOperand(0)->getOperand(0) &&
+ // Constant index.
+ isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
+ isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
+ Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
+ Elt1->getOperand(0)->getConstantOperandVal(1) &&
+ // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
+ // ResultType's known minimum vector length.
+ Elt0->getOperand(0)->getConstantOperandVal(1) %
+ VT.getVectorMinNumElements() ==
+ 0) {
+ SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
+ if (SrcVec.getValueType() == MVT::v4f16 ||
+ SrcVec.getValueType() == MVT::v4bf16) {
+ SDValue HalfToSingle =
+ DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
+ SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
+ SDValue Extract = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
+ HalfToSingle, SubvectorIdx);
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
+ }
+ }
+ }
+
// A build vector of two extracted elements is equivalent to an
// extract subvector where the inner vector is any-extended to the
// extract_vector_elt VT.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 091db559a33708..8360bef8e2f826 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6832,7 +6832,7 @@ multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
}
multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
- Intrinsic OpNode> {
+ SDPatternOperator OpNode> {
def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
asm, ".2s", ".2d",
[(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
@@ -7547,7 +7547,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
- [(set (f32 FPR32:$Rd), (AArch64fcvtxn (f64 FPR64:$Rn)))]>,
+ [(set (f32 FPR32:$Rd), (AArch64fcvtxnsdr (f64 FPR64:$Rn)))]>,
Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f8c6d9019ef6e1..3c67f616c1b9ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -757,9 +757,12 @@ def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
def AArch64fcvtxn_n: SDNode<"AArch64ISD::FCVTXN", SDTFPRoundOp>;
-def AArch64fcvtxn: PatFrags<(ops node:$Rn),
- [(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
- (f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
+def AArch64fcvtxnsdr: PatFrags<(ops node:$Rn),
+ [(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
+ (f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
+def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
+ [(int_aarch64_neon_fcvtxn node:$Rn),
+ (AArch64fcvtxn_n node:$Rn)]>;
def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
@@ -5042,7 +5045,7 @@ def : Pat<(concat_vectors V64:$Rd, (v4f16 (any_fpround (v4f32 V128:$Rn)))),
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
- int_aarch64_neon_fcvtxn>;
+ AArch64fcvtxnv>;
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index aa6b7cb495f189..cafee32ada6868 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -199,6 +199,60 @@ define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
ret <2 x float> %vcvt1.i
}
+; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
+; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
+define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
+; GENERIC-LABEL: test_vcvt_bf16_f64:
+; GENERIC: // %bb.0:
+; GENERIC-NEXT: fcvtxn v0.2s, v0.2d
+; GENERIC-NEXT: movi.4s v1, #127, msl #8
+; GENERIC-NEXT: movi.4s v2, #1
+; GENERIC-NEXT: ushr.4s v3, v0, #16
+; GENERIC-NEXT: add.4s v1, v0, v1
+; GENERIC-NEXT: and.16b v2, v3, v2
+; GENERIC-NEXT: add.4s v1, v2, v1
+; GENERIC-NEXT: fcmeq.4s v2, v0, v0
+; GENERIC-NEXT: orr.4s v0, #64, lsl #16
+; GENERIC-NEXT: bit.16b v0, v1, v2
+; GENERIC-NEXT: shrn.4h v0, v0, #16
+; GENERIC-NEXT: ret
+;
+; FAST-LABEL: test_vcvt_bf16_f64:
+; FAST: // %bb.0:
+; FAST-NEXT: fcvtxn v1.2s, v0.2d
+; FAST-NEXT: // implicit-def: $q0
+; FAST-NEXT: fmov d0, d1
+; FAST-NEXT: ushr.4s v1, v0, #16
+; FAST-NEXT: movi.4s v2, #1
+; FAST-NEXT: and.16b v1, v1, v2
+; FAST-NEXT: add.4s v1, v1, v0
+; FAST-NEXT: movi.4s v2, #127, msl #8
+; FAST-NEXT: add.4s v1, v1, v2
+; FAST-NEXT: mov.16b v2, v0
+; FAST-NEXT: orr.4s v2, #64, lsl #16
+; FAST-NEXT: fcmeq.4s v0, v0, v0
+; FAST-NEXT: bsl.16b v0, v1, v2
+; FAST-NEXT: shrn.4h v0, v0, #16
+; FAST-NEXT: ret
+;
+; GISEL-LABEL: test_vcvt_bf16_f64:
+; GISEL: // %bb.0:
+; GISEL-NEXT: fcvtxn v0.2s, v0.2d
+; GISEL-NEXT: movi.4s v1, #127, msl #8
+; GISEL-NEXT: movi.4s v2, #1
+; GISEL-NEXT: ushr.4s v3, v0, #16
+; GISEL-NEXT: add.4s v1, v0, v1
+; GISEL-NEXT: and.16b v2, v3, v2
+; GISEL-NEXT: add.4s v1, v2, v1
+; GISEL-NEXT: fcmeq.4s v2, v0, v0
+; GISEL-NEXT: orr.4s v0, #64, lsl #16
+; GISEL-NEXT: bit.16b v0, v1, v2
+; GISEL-NEXT: shrn.4h v0, v0, #16
+; GISEL-NEXT: ret
+ %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
+ ret <2 x bfloat> %vcvt1.i
+}
+
define half @test_vcvt_f16_f32(<1 x float> %x) {
; GENERIC-LABEL: test_vcvt_f16_f32:
; GENERIC: // %bb.0:
@@ -350,3 +404,5 @@ define float @from_half(i16 %in) {
declare float @llvm.convert.from.fp16.f32(i16) #1
declare i16 @llvm.convert.to.fp16.f32(float) #1
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FALLBACK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 7ff61d9bcb0cfc..ded343b990ac15 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -312,25 +312,12 @@ define <8 x half> @s_to_h(<8 x float> %a) {
define <8 x half> @d_to_h(<8 x double> %a) {
; CHECK-LABEL: d_to_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov d5, v0.d[1]
-; CHECK-NEXT: fcvt h0, d0
-; CHECK-NEXT: fcvt h4, d1
-; CHECK-NEXT: mov d1, v1.d[1]
-; CHECK-NEXT: fcvt h5, d5
-; CHECK-NEXT: fcvt h1, d1
-; CHECK-NEXT: mov v0.h[1], v5.h[0]
-; CHECK-NEXT: mov v0.h[2], v4.h[0]
-; CHECK-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-NEXT: fcvt h1, d2
-; CHECK-NEXT: mov d2, v2.d[1]
-; CHECK-NEXT: mov v0.h[4], v1.h[0]
-; CHECK-NEXT: fcvt h1, d2
-; CHECK-NEXT: mov d2, v3.d[1]
-; CHECK-NEXT: mov v0.h[5], v1.h[0]
-; CHECK-NEXT: fcvt h1, d3
-; CHECK-NEXT: mov v0.h[6], v1.h[0]
-; CHECK-NEXT: fcvt h1, d2
-; CHECK-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
@@ -349,25 +336,12 @@ define <8 x float> @h_to_s(<8 x half> %a) {
define <8 x double> @h_to_d(<8 x half> %a) {
; CHECK-LABEL: h_to_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: mov h3, v0.h[3]
-; CHECK-NEXT: mov h4, v0.h[2]
-; CHECK-NEXT: fcvt d0, h0
-; CHECK-NEXT: mov h5, v2.h[1]
-; CHECK-NEXT: mov h6, v2.h[3]
-; CHECK-NEXT: mov h7, v2.h[2]
-; CHECK-NEXT: fcvt d16, h1
-; CHECK-NEXT: fcvt d17, h3
-; CHECK-NEXT: fcvt d1, h4
-; CHECK-NEXT: fcvt d2, h2
-; CHECK-NEXT: fcvt d4, h5
-; CHECK-NEXT: fcvt d5, h6
-; CHECK-NEXT: fcvt d3, h7
-; CHECK-NEXT: mov v0.d[1], v16.d[0]
-; CHECK-NEXT: mov v1.d[1], v17.d[0]
-; CHECK-NEXT: mov v2.d[1], v4.d[0]
-; CHECK-NEXT: mov v3.d[1], v5.d[0]
+; CHECK-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-NEXT: fcvtl2 v3.2d, v2.4s
+; CHECK-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-NEXT: fcvtl v2.2d, v2.2s
; CHECK-NEXT: ret
%1 = fpext <8 x half> %a to <8 x double>
ret <8 x double> %1
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index eca3389bcd88b5..86f7322f7c4eeb 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -85,29 +85,46 @@ entry:
}
define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) {
-; CHECK-LABEL: fpext_v2f16_v2f64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvt d0, h0
-; CHECK-NEXT: fcvt d1, h1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fpext_v2f16_v2f64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fpext_v2f16_v2f64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: fcvt d0, h0
+; CHECK-GI-NEXT: fcvt d1, h1
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: ret
entry:
%c = fpext <2 x half> %a to <2 x double>
ret <2 x double> %c
}
define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
-; CHECK-LABEL: fpext_v3f16_v3f64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fcvt d0, h0
-; CHECK-NEXT: fcvt d1, h1
-; CHECK-NEXT: fcvt d2, h2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fpext_v3f16_v3f64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-SD-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-SD-NEXT: fcvtl2 v2.2d, v1.4s
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fpext_v3f16_v3f64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: mov h2, v0.h[2]
+; CHECK-GI-NEXT: fcvt d0, h0
+; CHECK-GI-NEXT: fcvt d1, h1
+; CHECK-GI-NEXT: fcvt d2, h2
+; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x half> %a to <3 x double>
ret <3 x double> %c
@@ -116,16 +133,9 @@ entry:
define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) {
; CHECK-SD-LABEL: fpext_v4f16_v4f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NEXT: mov h2, v0.h[3]
-; CHECK-SD-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NEXT: fcvt d0, h0
-; CHECK-SD-NEXT: fcvt d4, h1
-; CHECK-SD-NEXT: fcvt d2, h2
-; CHECK-SD-NEXT: fcvt d1, h3
-; CHECK-SD-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-SD-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fpext_v4f16_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 9425988af83491..3efc98ab5fd532 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -84,11 +84,8 @@ entry:
define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
; CHECK-SD-LABEL: fptrunc_v2f64_v2f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov d1, v0.d[1]
-; CHECK-SD-NEXT: fcvt h0, d0
-; CHECK-SD-NEXT: fcvt h1, d1
-; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptrunc_v2f64_v2f16:
@@ -135,16 +132,9 @@ entry:
define <4 x half> @fptrunc_v4f64_v4f16(<4 x double> %a) {
; CHECK-SD-LABEL: fptrunc_v4f64_v4f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov d2, v0.d[1]
-; CHECK-SD-NEXT: fcvt h0, d0
-; CHECK-SD-NEXT: fcvt h2, d2
-; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-SD-NEXT: fcvt h2, d1
-; CHECK-SD-NEXT: mov d1, v1.d[1]
-; CHECK-SD-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-SD-NEXT: fcvt h1, d1
-; CHECK-SD-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptrunc_v4f64_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index da9b79a56a9518..2ace0bca274af1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -427,49 +427,35 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
; SVE-LABEL: test_copysign_v4f16_v4f64:
; SVE: // %bb.0:
-; SVE-NEXT: sub sp, sp, #16
-; SVE-NEXT: .cfi_def_cfa_offset 16
-; SVE-NEXT: ldp q1, q0, [x1]
-; SVE-NEXT: ldr d4, [x0]
-; SVE-NEXT: and z4.h, z4.h, #0x7fff
-; SVE-NEXT: mov z2.d, z0.d[1]
-; SVE-NEXT: mov z3.d, z1.d[1]
-; SVE-NEXT: fcvt h0, d0
-; SVE-NEXT: fcvt h1, d1
-; SVE-NEXT: fcvt h2, d2
-; SVE-NEXT: fcvt h3, d3
-; SVE-NEXT: str h0, [sp, #12]
-; SVE-NEXT: str h1, [sp, #8]
-; SVE-NEXT: str h2, [sp, #14]
-; SVE-NEXT: str h3, [sp, #10]
-; SVE-NEXT: ldr d0, [sp, #8]
+; SVE-NEXT: ldp q0, q1, [x1]
+; SVE-NEXT: ptrue p0.s, vl2
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: fcvtxn v1.2s, v1.2d
+; SVE-NEXT: fcvtxn v0.2s, v0.2d
+; SVE-NEXT: splice z0.s, p0, z0.s, z1.s
+; SVE-NEXT: ldr d1, [x0]
+; SVE-NEXT: and z1.h, z1.h, #0x7fff
+; SVE-NEXT: fcvt z0.h, p1/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z0.h
; SVE-NEXT: and z0.h, z0.h, #0x8000
-; SVE-NEXT: orr z0.d, z4.d, z0.d
+; SVE-NEXT: orr z0.d, z1.d, z0.d
; SVE-NEXT: str d0, [x0]
-; SVE-NEXT: add sp, sp, #16
; SVE-NEXT: ret
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
-; SVE2-NEXT: sub sp, sp, #16
-; SVE2-NEXT: .cfi_def_cfa_offset 16
-; SVE2-NEXT: ldp q2, q1, [x1]
-; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT: ldr d5, [x0]
-; SVE2-NEXT: mov z3.d, z1.d[1]
-; SVE2-NEXT: mov z4.d, z2.d[1]
-; SVE2-NEXT: fcvt h1, d1
-; SVE2-NEXT: fcvt h2, d2
-; SVE2-NEXT: fcvt h3, d3
-; SVE2-NEXT: fcvt h4, d4
-; SVE2-NEXT: str h1, [sp, #12]
-; SVE2-NEXT: str h2, [sp, #8]
-; SVE2-NEXT: str h3, [sp, #14]
-; SVE2-NEXT: str h4, [sp, #10]
-; SVE2-NEXT: ldr d1, [sp, #8]
-; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT: str d5, [x0]
-; SVE2-NEXT: add sp, sp, #16
+; SVE2-NEXT: ldp q0, q1, [x1]
+; SVE2-NEXT: ptrue p0.s, vl2
+; SVE2-NEXT: ptrue p1.s
+; SVE2-NEXT: ldr d2, [x0]
+; SVE2-NEXT: fcvtxn v1.2s, v1.2d
+; SVE2-NEXT: fcvtxn v0.2s, v0.2d
+; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
+; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
+; SVE2-NEXT: fcvt z0.h, p1/m, z0.s
+; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
+; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT: str d2, [x0]
; SVE2-NEXT: ret
%a = load <4 x half>, ptr %ap
%b = load <4 x double>, ptr %bp
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
index c33759331bbc8d..de26676b5c73ee 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -209,16 +209,10 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov d3, v1[1]
-; CHECK-NEXT: fcvt h1, d1
-; CHECK-NEXT: fcvt h3, d3
-; CHECK-NEXT: mov.h v1[1], v3[0]
-; CHECK-NEXT: fcvt h3, d2
-; CHECK-NEXT: mov d2, v2[1]
-; CHECK-NEXT: mov.h v1[2], v3[0]
-; CHECK-NEXT: fcvt h2, d2
-; CHECK-NEXT: mov.h v1[3], v2[0]
+; CHECK-NEXT: fcvtxn v1.2s, v1.2d
+; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d
; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: fcvtn v1.4h, v1.4s
; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x half>
@@ -291,42 +285,20 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b)
define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4bf16_v4f64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov d3, v1[1]
-; CHECK-NEXT: fcvtxn s1, d1
-; CHECK-NEXT: mov w8, #32767 ; =0x7fff
-; CHECK-NEXT: fcvtxn s3, d3
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: ubfx w12, w10, #16, #1
-; CHECK-NEXT: add w10, w10, w8
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: fcvtxn s3, d2
-; CHECK-NEXT: mov d2, v2[1]
-; CHECK-NEXT: add w10, w12, w10
-; CHECK-NEXT: lsr w10, w10, #16
-; CHECK-NEXT: ubfx w11, w9, #16, #1
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: fcvtxn s1, d2
-; CHECK-NEXT: add w9, w11, w9
-; CHECK-NEXT: fmov w11, s3
-; CHECK-NEXT: fmov s3, w10
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: ubfx w12, w11, #16, #1
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: add w9, w11, w8
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: add w9, w12, w9
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: mov.h v3[1], v2[0]
-; CHECK-NEXT: ubfx w11, w10, #16, #1
-; CHECK-NEXT: add w8, w10, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: add w8, w11, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v3[2], v1[0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov.h v3[3], v1[0]
-; CHECK-NEXT: mvni.4h v1, #128, lsl #8
-; CHECK-NEXT: bif.8b v0, v3, v1
+; CHECK-NEXT: fcvtxn v1.2s, v1.2d
+; CHECK-NEXT: movi.4s v3, #1
+; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d
+; CHECK-NEXT: movi.4s v2, #127, msl #8
+; CHECK-NEXT: ushr.4s v4, v1, #16
+; CHECK-NEXT: add.4s v2, v1, v2
+; CHECK-NEXT: and.16b v3, v4, v3
+; CHECK-NEXT: add.4s v2, v3, v2
+; CHECK-NEXT: fcmeq.4s v3, v1, v1
+; CHECK-NEXT: orr.4s v1, #64, lsl #16
+; CHECK-NEXT: bit.16b v1, v2, v3
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: shrn.4h v1, v1, #16
+; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x bfloat>
%r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %tmp0)