[llvm] 930e7ff - [AArch64] Optimize abs, neg and copysign for fp16/bf16
David Majnemer via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 4 12:05:13 PST 2024
Author: David Majnemer
Date: 2024-03-04T20:05:05Z
New Revision: 930e7ff9aee24140e8c11f6527ba0e3e2208b55c
URL: https://github.com/llvm/llvm-project/commit/930e7ff9aee24140e8c11f6527ba0e3e2208b55c
DIFF: https://github.com/llvm/llvm-project/commit/930e7ff9aee24140e8c11f6527ba0e3e2208b55c.diff
LOG: [AArch64] Optimize abs, neg and copysign for fp16/bf16
We can implement these operations with bitwise arithmetic on the sign bit,
which is considerably faster than legalizing them via promotion to f32.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/f16-instructions.ll
llvm/test/CodeGen/AArch64/fabs.ll
llvm/test/CodeGen/AArch64/fcopysign.ll
llvm/test/CodeGen/AArch64/fneg.ll
llvm/test/CodeGen/AArch64/vector-fcopysign.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5af89bc5192a1b..63725f840b6fcb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -676,11 +676,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
- if (Subtarget->hasFullFP16())
+ if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
- else
+ setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
+ } else {
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
+ }
for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
@@ -699,23 +701,48 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
auto LegalizeNarrowFP = [this](MVT ScalarVT) {
- for (auto Op :
- {ISD::SETCC, ISD::SELECT_CC,
- ISD::BR_CC, ISD::FADD, ISD::FSUB,
- ISD::FMUL, ISD::FDIV, ISD::FMA,
- ISD::FNEG, ISD::FABS, ISD::FCEIL,
- ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
- ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
- ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
- ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
- ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
- ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
- ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
- ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
- ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
- ISD::STRICT_FMAXIMUM})
+ for (auto Op : {ISD::SETCC,
+ ISD::SELECT_CC,
+ ISD::BR_CC,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FMUL,
+ ISD::FDIV,
+ ISD::FMA,
+ ISD::FCEIL,
+ ISD::FSQRT,
+ ISD::FFLOOR,
+ ISD::FNEARBYINT,
+ ISD::FRINT,
+ ISD::FROUND,
+ ISD::FROUNDEVEN,
+ ISD::FTRUNC,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
+ ISD::STRICT_FADD,
+ ISD::STRICT_FSUB,
+ ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV,
+ ISD::STRICT_FMA,
+ ISD::STRICT_FCEIL,
+ ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT,
+ ISD::STRICT_FRINT,
+ ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND,
+ ISD::STRICT_FTRUNC,
+ ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM,
+ ISD::STRICT_FMAXNUM,
+ ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
setOperationAction(Op, ScalarVT, Promote);
+ for (auto Op : {ISD::FNEG, ISD::FABS})
+ setOperationAction(Op, ScalarVT, Legal);
+
// Round-to-integer need custom lowering for fp16, as Promote doesn't work
// because the result type is integer.
for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
@@ -730,8 +757,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
- setOperationAction(ISD::FABS, V4Narrow, Expand);
- setOperationAction(ISD::FNEG, V4Narrow, Expand);
+ setOperationAction(ISD::FABS, V4Narrow, Legal);
+ setOperationAction(ISD::FNEG, V4Narrow, Legal);
setOperationAction(ISD::FROUND, V4Narrow, Expand);
setOperationAction(ISD::FROUNDEVEN, V4Narrow, Expand);
setOperationAction(ISD::FMA, V4Narrow, Expand);
@@ -740,7 +767,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, V4Narrow, Expand);
setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
setOperationAction(ISD::FTRUNC, V4Narrow, Expand);
- setOperationAction(ISD::FCOPYSIGN, V4Narrow, Expand);
+ setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
setOperationAction(ISD::FFLOOR, V4Narrow, Expand);
setOperationAction(ISD::FCEIL, V4Narrow, Expand);
setOperationAction(ISD::FRINT, V4Narrow, Expand);
@@ -748,16 +775,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, V4Narrow, Expand);
auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
- setOperationAction(ISD::FABS, V8Narrow, Expand);
+ setOperationAction(ISD::FABS, V8Narrow, Legal);
setOperationAction(ISD::FADD, V8Narrow, Expand);
setOperationAction(ISD::FCEIL, V8Narrow, Expand);
- setOperationAction(ISD::FCOPYSIGN, V8Narrow, Expand);
+ setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
setOperationAction(ISD::FDIV, V8Narrow, Expand);
setOperationAction(ISD::FFLOOR, V8Narrow, Expand);
setOperationAction(ISD::FMA, V8Narrow, Expand);
setOperationAction(ISD::FMUL, V8Narrow, Expand);
setOperationAction(ISD::FNEARBYINT, V8Narrow, Expand);
- setOperationAction(ISD::FNEG, V8Narrow, Expand);
+ setOperationAction(ISD::FNEG, V8Narrow, Legal);
setOperationAction(ISD::FROUND, V8Narrow, Expand);
setOperationAction(ISD::FROUNDEVEN, V8Narrow, Expand);
setOperationAction(ISD::FRINT, V8Narrow, Expand);
@@ -1745,7 +1772,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
// But we do support custom-lowering for FCOPYSIGN.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
+ ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16) &&
+ Subtarget->hasFullFP16()))
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -9208,7 +9237,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
} else if (VT == MVT::f32) {
VecVT = MVT::v4i32;
SetVecVal(AArch64::ssub);
- } else if (VT == MVT::f16) {
+ } else if (VT == MVT::f16 || VT == MVT::bf16) {
VecVT = MVT::v8i16;
SetVecVal(AArch64::hsub);
} else {
@@ -9230,7 +9259,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue BSP =
DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
- if (VT == MVT::f16)
+ if (VT == MVT::f16 || VT == MVT::bf16)
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
if (VT == MVT::f32)
return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c153bb3f0145bc..2f3aaf86d376f6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5028,9 +5028,6 @@ defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu
defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
(FCVTNv4i16 V128:$Rn)>;
-//def : Pat<(concat_vectors V64:$Rd,
-// (v4bf16 (any_fpround (v4f32 V128:$Rn)))),
-// (FCVTNv8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
@@ -7813,6 +7810,42 @@ def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
}
+def abs_f16 :
+ OutPatFrag<(ops node:$Rn),
+ (EXTRACT_SUBREG (f32 (COPY_TO_REGCLASS
+ (i32 (ANDWri
+ (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ node:$Rn, hsub), GPR32)),
+ (i32 (logical_imm32_XFORM(i32 0x7fff))))),
+ FPR32)), hsub)>;
+
+def : Pat<(f16 (fabs (f16 FPR16:$Rn))), (f16 (abs_f16 (f16 FPR16:$Rn)))>;
+def : Pat<(bf16 (fabs (bf16 FPR16:$Rn))), (bf16 (abs_f16 (bf16 FPR16:$Rn)))>;
+
+def neg_f16 :
+ OutPatFrag<(ops node:$Rn),
+ (EXTRACT_SUBREG (f32 (COPY_TO_REGCLASS
+ (i32 (EORWri
+ (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ node:$Rn, hsub), GPR32)),
+ (i32 (logical_imm32_XFORM(i32 0x8000))))),
+ FPR32)), hsub)>;
+
+def : Pat<(f16 (fneg (f16 FPR16:$Rn))), (f16 (neg_f16 (f16 FPR16:$Rn)))>;
+def : Pat<(bf16 (fneg (bf16 FPR16:$Rn))), (bf16 (neg_f16 (bf16 FPR16:$Rn)))>;
+
+let Predicates = [HasNEON] in {
+def : Pat<(v4f16 (fabs (v4f16 V64:$Rn))), (v4f16 (BICv4i16 (v4f16 V64:$Rn), (i32 128), (i32 8)))>;
+def : Pat<(v4bf16 (fabs (v4bf16 V64:$Rn))), (v4bf16 (BICv4i16 (v4bf16 V64:$Rn), (i32 128), (i32 8)))>;
+def : Pat<(v8f16 (fabs (v8f16 V128:$Rn))), (v8f16 (BICv8i16 (v8f16 V128:$Rn), (i32 128), (i32 8)))>;
+def : Pat<(v8bf16 (fabs (v8bf16 V128:$Rn))), (v8bf16 (BICv8i16 (v8bf16 V128:$Rn), (i32 128), (i32 8)))>;
+
+def : Pat<(v4f16 (fneg (v4f16 V64:$Rn))), (v4f16 (EORv8i8 (v4f16 V64:$Rn), (MOVIv4i16 (i32 128), (i32 8))))>;
+def : Pat<(v4bf16 (fneg (v4bf16 V64:$Rn))), (v4bf16 (EORv8i8 (v4bf16 V64:$Rn), (v4i16 (MOVIv4i16 (i32 0x80), (i32 8)))))>;
+def : Pat<(v8f16 (fneg (v8f16 V128:$Rn))), (v8f16 (EORv16i8 (v8f16 V128:$Rn), (MOVIv8i16 (i32 128), (i32 8))))>;
+def : Pat<(v8bf16 (fneg (v8bf16 V128:$Rn))), (v8bf16 (EORv16i8 (v8bf16 V128:$Rn), (v8i16 (MOVIv8i16 (i32 0x80), (i32 8)))))>;
+}
+
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// These patterns are more complex because floating point loads do not
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 0be41f246512ff..fa362ae798aa8f 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1027,9 +1027,9 @@ define half @test_fma(half %a, half %b, half %c) #0 {
}
; CHECK-CVT-LABEL: test_fabs:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fabs s0, s0
-; CHECK-CVT-NEXT: fcvt h0, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: and w8, w8, #0x7fff
+; CHECK-CVT-NEXT: fmov s0, w8
; CHECK-CVT-NEXT: ret
; CHECK-FP16-LABEL: test_fabs:
@@ -1338,3 +1338,12 @@ define half @test_fmuladd(half %a, half %b, half %c) #0 {
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-COMMON: {{.*}}
+; CHECK-CVT: {{.*}}
+; CHECK-FP16: {{.*}}
+; FALLBACK: {{.*}}
+; FALLBACK-FP16: {{.*}}
+; GISEL: {{.*}}
+; GISEL-CVT: {{.*}}
+; GISEL-FP16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 5462bc65fd346f..7c13b49246d230 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -27,9 +27,11 @@ entry:
define half @fabs_f16(half %a) {
; CHECK-SD-NOFP16-LABEL: fabs_f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fabs s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-SD-NOFP16-NEXT: fmov w8, s0
+; CHECK-SD-NOFP16-NEXT: and w8, w8, #0x7fff
+; CHECK-SD-NOFP16-NEXT: fmov s0, w8
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_f16:
@@ -148,45 +150,7 @@ entry:
define <7 x half> @fabs_v7f16(<7 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v7f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fabs s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fabs s5, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fabs s2, s3
-; CHECK-SD-NOFP16-NEXT: fabs s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fabs s5, s5
-; CHECK-SD-NOFP16-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fabs s3, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fabs s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT: fabs s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT: bic v0.8h, #128, lsl #8
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v7f16:
@@ -234,26 +198,7 @@ entry:
define <4 x half> @fabs_v4f16(<4 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v4f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fabs s0, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT: fabs s1, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fabs s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fabs s2, s3
-; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: bic v0.4h, #128, lsl #8
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v4f16:
@@ -280,45 +225,7 @@ entry:
define <8 x half> @fabs_v8f16(<8 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v8f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fabs s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fabs s5, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fabs s2, s3
-; CHECK-SD-NOFP16-NEXT: fabs s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fabs s5, s5
-; CHECK-SD-NOFP16-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fabs s3, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fabs s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT: fabs s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT: bic v0.8h, #128, lsl #8
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v8f16:
@@ -348,84 +255,8 @@ entry:
define <16 x half> @fabs_v16f16(<16 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v16f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fabs s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fabs s6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: fabs s18, s2
-; CHECK-SD-NOFP16-NEXT: fabs s19, s3
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT: fabs s4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT: fabs s6, s7
-; CHECK-SD-NOFP16-NEXT: fabs s16, s16
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT: fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fabs s5, s17
-; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT: fabs s18, s18
-; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fabs s4, s19
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT: fabs s7, s7
-; CHECK-SD-NOFP16-NEXT: fabs s16, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fabs s4, s6
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT: fabs s7, s17
-; CHECK-SD-NOFP16-NEXT: fabs s0, s0
-; CHECK-SD-NOFP16-NEXT: fabs s1, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: bic v0.8h, #128, lsl #8
+; CHECK-SD-NOFP16-NEXT: bic v1.8h, #128, lsl #8
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v16f16:
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 89e78f71474905..78fd38ca9f268a 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -199,63 +199,8 @@ entry:
define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-SD-LABEL: copysign_v7f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov h2, v1.h[1]
-; CHECK-SD-NEXT: mov h4, v0.h[1]
-; CHECK-SD-NEXT: fcvt s5, h1
-; CHECK-SD-NEXT: fcvt s6, h0
-; CHECK-SD-NEXT: mvni v3.4s, #128, lsl #24
-; CHECK-SD-NEXT: mov h7, v1.h[2]
-; CHECK-SD-NEXT: mov h16, v0.h[2]
-; CHECK-SD-NEXT: mov h17, v1.h[3]
-; CHECK-SD-NEXT: fcvt s2, h2
-; CHECK-SD-NEXT: fcvt s4, h4
-; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b
-; CHECK-SD-NEXT: mov h6, v0.h[3]
-; CHECK-SD-NEXT: fcvt s7, h7
-; CHECK-SD-NEXT: fcvt s16, h16
-; CHECK-SD-NEXT: fcvt s17, h17
-; CHECK-SD-NEXT: bif v4.16b, v2.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h2, s5
-; CHECK-SD-NEXT: mov v5.16b, v3.16b
-; CHECK-SD-NEXT: fcvt s6, h6
-; CHECK-SD-NEXT: bsl v5.16b, v16.16b, v7.16b
-; CHECK-SD-NEXT: fcvt h4, s4
-; CHECK-SD-NEXT: mov h7, v1.h[4]
-; CHECK-SD-NEXT: mov h16, v0.h[4]
-; CHECK-SD-NEXT: bif v6.16b, v17.16b, v3.16b
-; CHECK-SD-NEXT: mov h17, v0.h[5]
-; CHECK-SD-NEXT: fcvt h5, s5
-; CHECK-SD-NEXT: mov v2.h[1], v4.h[0]
-; CHECK-SD-NEXT: fcvt s4, h7
-; CHECK-SD-NEXT: fcvt s7, h16
-; CHECK-SD-NEXT: mov h16, v1.h[5]
-; CHECK-SD-NEXT: fcvt h6, s6
-; CHECK-SD-NEXT: fcvt s17, h17
-; CHECK-SD-NEXT: mov v2.h[2], v5.h[0]
-; CHECK-SD-NEXT: mov h5, v1.h[6]
-; CHECK-SD-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NEXT: bit v4.16b, v7.16b, v3.16b
-; CHECK-SD-NEXT: mov h7, v0.h[6]
-; CHECK-SD-NEXT: fcvt s16, h16
-; CHECK-SD-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NEXT: mov v2.h[3], v6.h[0]
-; CHECK-SD-NEXT: fcvt s5, h5
-; CHECK-SD-NEXT: fcvt s1, h1
-; CHECK-SD-NEXT: fcvt s6, h7
-; CHECK-SD-NEXT: mov v7.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h4, s4
-; CHECK-SD-NEXT: fcvt s0, h0
-; CHECK-SD-NEXT: bsl v7.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.h[4], v4.h[0]
-; CHECK-SD-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h4, s7
-; CHECK-SD-NEXT: fcvt h0, s0
-; CHECK-SD-NEXT: mov v2.h[5], v4.h[0]
-; CHECK-SD-NEXT: fcvt h4, s5
-; CHECK-SD-NEXT: mov v2.h[6], v4.h[0]
-; CHECK-SD-NEXT: mov v2.h[7], v0.h[0]
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mvni v2.8h, #128, lsl #8
+; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: copysign_v7f16:
@@ -290,246 +235,34 @@ entry:
}
define <4 x half> @copysign_v4f16(<4 x half> %a, <4 x half> %b) {
-; CHECK-SD-LABEL: copysign_v4f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov h3, v1.h[1]
-; CHECK-SD-NEXT: mov h4, v0.h[1]
-; CHECK-SD-NEXT: mov h5, v1.h[2]
-; CHECK-SD-NEXT: mov h6, v0.h[2]
-; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24
-; CHECK-SD-NEXT: fcvt s7, h1
-; CHECK-SD-NEXT: fcvt s16, h0
-; CHECK-SD-NEXT: mov h1, v1.h[3]
-; CHECK-SD-NEXT: fcvt s3, h3
-; CHECK-SD-NEXT: fcvt s4, h4
-; CHECK-SD-NEXT: fcvt s1, h1
-; CHECK-SD-NEXT: bit v3.16b, v4.16b, v2.16b
-; CHECK-SD-NEXT: fcvt s4, h5
-; CHECK-SD-NEXT: fcvt s5, h6
-; CHECK-SD-NEXT: mov v6.16b, v2.16b
-; CHECK-SD-NEXT: bsl v6.16b, v16.16b, v7.16b
-; CHECK-SD-NEXT: mov h7, v0.h[3]
-; CHECK-SD-NEXT: bit v4.16b, v5.16b, v2.16b
-; CHECK-SD-NEXT: fcvt h3, s3
-; CHECK-SD-NEXT: fcvt h0, s6
-; CHECK-SD-NEXT: fcvt s5, h7
-; CHECK-SD-NEXT: mov v0.h[1], v3.h[0]
-; CHECK-SD-NEXT: fcvt h3, s4
-; CHECK-SD-NEXT: bit v1.16b, v5.16b, v2.16b
-; CHECK-SD-NEXT: mov v0.h[2], v3.h[0]
-; CHECK-SD-NEXT: fcvt h1, s1
-; CHECK-SD-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: copysign_v4f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mvni v2.4h, #128, lsl #8
-; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: copysign_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mvni v2.4h, #128, lsl #8
+; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: ret
entry:
%c = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %c
}
define <8 x half> @copysign_v8f16(<8 x half> %a, <8 x half> %b) {
-; CHECK-SD-LABEL: copysign_v8f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov h2, v1.h[1]
-; CHECK-SD-NEXT: mov h4, v0.h[1]
-; CHECK-SD-NEXT: fcvt s5, h1
-; CHECK-SD-NEXT: fcvt s6, h0
-; CHECK-SD-NEXT: mvni v3.4s, #128, lsl #24
-; CHECK-SD-NEXT: mov h7, v1.h[2]
-; CHECK-SD-NEXT: mov h16, v0.h[2]
-; CHECK-SD-NEXT: mov h17, v1.h[3]
-; CHECK-SD-NEXT: fcvt s2, h2
-; CHECK-SD-NEXT: fcvt s4, h4
-; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b
-; CHECK-SD-NEXT: mov h6, v0.h[3]
-; CHECK-SD-NEXT: fcvt s7, h7
-; CHECK-SD-NEXT: fcvt s16, h16
-; CHECK-SD-NEXT: fcvt s17, h17
-; CHECK-SD-NEXT: bif v4.16b, v2.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h2, s5
-; CHECK-SD-NEXT: mov v5.16b, v3.16b
-; CHECK-SD-NEXT: fcvt s6, h6
-; CHECK-SD-NEXT: bsl v5.16b, v16.16b, v7.16b
-; CHECK-SD-NEXT: fcvt h4, s4
-; CHECK-SD-NEXT: mov h7, v1.h[4]
-; CHECK-SD-NEXT: mov h16, v0.h[4]
-; CHECK-SD-NEXT: bif v6.16b, v17.16b, v3.16b
-; CHECK-SD-NEXT: mov h17, v0.h[5]
-; CHECK-SD-NEXT: fcvt h5, s5
-; CHECK-SD-NEXT: mov v2.h[1], v4.h[0]
-; CHECK-SD-NEXT: fcvt s4, h7
-; CHECK-SD-NEXT: fcvt s7, h16
-; CHECK-SD-NEXT: mov h16, v1.h[5]
-; CHECK-SD-NEXT: fcvt h6, s6
-; CHECK-SD-NEXT: fcvt s17, h17
-; CHECK-SD-NEXT: mov v2.h[2], v5.h[0]
-; CHECK-SD-NEXT: mov h5, v1.h[6]
-; CHECK-SD-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NEXT: bit v4.16b, v7.16b, v3.16b
-; CHECK-SD-NEXT: mov h7, v0.h[6]
-; CHECK-SD-NEXT: fcvt s16, h16
-; CHECK-SD-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NEXT: mov v2.h[3], v6.h[0]
-; CHECK-SD-NEXT: fcvt s5, h5
-; CHECK-SD-NEXT: fcvt s1, h1
-; CHECK-SD-NEXT: fcvt s6, h7
-; CHECK-SD-NEXT: mov v7.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h4, s4
-; CHECK-SD-NEXT: fcvt s0, h0
-; CHECK-SD-NEXT: bsl v7.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.h[4], v4.h[0]
-; CHECK-SD-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-SD-NEXT: fcvt h4, s7
-; CHECK-SD-NEXT: fcvt h0, s0
-; CHECK-SD-NEXT: mov v2.h[5], v4.h[0]
-; CHECK-SD-NEXT: fcvt h4, s5
-; CHECK-SD-NEXT: mov v2.h[6], v4.h[0]
-; CHECK-SD-NEXT: mov v2.h[7], v0.h[0]
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: copysign_v8f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mvni v2.8h, #128, lsl #8
-; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: copysign_v8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mvni v2.8h, #128, lsl #8
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
%c = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
ret <8 x half> %c
}
define <16 x half> @copysign_v16f16(<16 x half> %a, <16 x half> %b) {
-; CHECK-SD-LABEL: copysign_v16f16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov h4, v2.h[1]
-; CHECK-SD-NEXT: mov h5, v0.h[1]
-; CHECK-SD-NEXT: mvni v6.4s, #128, lsl #24
-; CHECK-SD-NEXT: mov h7, v3.h[1]
-; CHECK-SD-NEXT: mov h16, v1.h[1]
-; CHECK-SD-NEXT: fcvt s17, h2
-; CHECK-SD-NEXT: fcvt s18, h0
-; CHECK-SD-NEXT: mov h19, v2.h[2]
-; CHECK-SD-NEXT: mov h20, v0.h[2]
-; CHECK-SD-NEXT: fcvt s21, h3
-; CHECK-SD-NEXT: fcvt s22, h1
-; CHECK-SD-NEXT: mov h23, v3.h[2]
-; CHECK-SD-NEXT: fcvt s4, h4
-; CHECK-SD-NEXT: fcvt s5, h5
-; CHECK-SD-NEXT: mov h24, v1.h[2]
-; CHECK-SD-NEXT: fcvt s7, h7
-; CHECK-SD-NEXT: fcvt s16, h16
-; CHECK-SD-NEXT: mov h25, v1.h[3]
-; CHECK-SD-NEXT: mov h26, v1.h[6]
-; CHECK-SD-NEXT: bit v21.16b, v22.16b, v6.16b
-; CHECK-SD-NEXT: fcvt s22, h23
-; CHECK-SD-NEXT: bit v4.16b, v5.16b, v6.16b
-; CHECK-SD-NEXT: mov v5.16b, v6.16b
-; CHECK-SD-NEXT: fcvt s23, h24
-; CHECK-SD-NEXT: bit v7.16b, v16.16b, v6.16b
-; CHECK-SD-NEXT: mov h24, v3.h[3]
-; CHECK-SD-NEXT: bsl v5.16b, v18.16b, v17.16b
-; CHECK-SD-NEXT: fcvt s18, h19
-; CHECK-SD-NEXT: fcvt s19, h20
-; CHECK-SD-NEXT: mov h20, v0.h[3]
-; CHECK-SD-NEXT: mov h17, v2.h[3]
-; CHECK-SD-NEXT: fcvt h16, s4
-; CHECK-SD-NEXT: fcvt h7, s7
-; CHECK-SD-NEXT: fcvt h4, s5
-; CHECK-SD-NEXT: bit v18.16b, v19.16b, v6.16b
-; CHECK-SD-NEXT: fcvt h5, s21
-; CHECK-SD-NEXT: fcvt s19, h20
-; CHECK-SD-NEXT: mov v20.16b, v6.16b
-; CHECK-SD-NEXT: fcvt s17, h17
-; CHECK-SD-NEXT: fcvt s21, h25
-; CHECK-SD-NEXT: mov h25, v0.h[6]
-; CHECK-SD-NEXT: bsl v20.16b, v23.16b, v22.16b
-; CHECK-SD-NEXT: mov v4.h[1], v16.h[0]
-; CHECK-SD-NEXT: fcvt s16, h24
-; CHECK-SD-NEXT: fcvt h18, s18
-; CHECK-SD-NEXT: mov h22, v2.h[4]
-; CHECK-SD-NEXT: mov h23, v0.h[4]
-; CHECK-SD-NEXT: bit v17.16b, v19.16b, v6.16b
-; CHECK-SD-NEXT: mov h19, v3.h[4]
-; CHECK-SD-NEXT: mov h24, v1.h[4]
-; CHECK-SD-NEXT: mov v5.h[1], v7.h[0]
-; CHECK-SD-NEXT: fcvt h7, s20
-; CHECK-SD-NEXT: bit v16.16b, v21.16b, v6.16b
-; CHECK-SD-NEXT: mov v4.h[2], v18.h[0]
-; CHECK-SD-NEXT: fcvt s18, h22
-; CHECK-SD-NEXT: fcvt s20, h23
-; CHECK-SD-NEXT: fcvt h17, s17
-; CHECK-SD-NEXT: fcvt s19, h19
-; CHECK-SD-NEXT: fcvt s21, h24
-; CHECK-SD-NEXT: mov h22, v2.h[5]
-; CHECK-SD-NEXT: mov h23, v0.h[5]
-; CHECK-SD-NEXT: mov h24, v1.h[5]
-; CHECK-SD-NEXT: mov v5.h[2], v7.h[0]
-; CHECK-SD-NEXT: fcvt h7, s16
-; CHECK-SD-NEXT: mov h16, v3.h[5]
-; CHECK-SD-NEXT: bit v18.16b, v20.16b, v6.16b
-; CHECK-SD-NEXT: mov h20, v2.h[6]
-; CHECK-SD-NEXT: mov h2, v2.h[7]
-; CHECK-SD-NEXT: bit v19.16b, v21.16b, v6.16b
-; CHECK-SD-NEXT: mov h21, v3.h[6]
-; CHECK-SD-NEXT: mov v4.h[3], v17.h[0]
-; CHECK-SD-NEXT: fcvt s17, h22
-; CHECK-SD-NEXT: fcvt s22, h23
-; CHECK-SD-NEXT: fcvt s23, h25
-; CHECK-SD-NEXT: mov v5.h[3], v7.h[0]
-; CHECK-SD-NEXT: fcvt s7, h16
-; CHECK-SD-NEXT: fcvt s16, h24
-; CHECK-SD-NEXT: fcvt h18, s18
-; CHECK-SD-NEXT: fcvt s20, h20
-; CHECK-SD-NEXT: fcvt s24, h26
-; CHECK-SD-NEXT: fcvt h19, s19
-; CHECK-SD-NEXT: fcvt s21, h21
-; CHECK-SD-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NEXT: bit v17.16b, v22.16b, v6.16b
-; CHECK-SD-NEXT: mov h3, v3.h[7]
-; CHECK-SD-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NEXT: bit v7.16b, v16.16b, v6.16b
-; CHECK-SD-NEXT: mov v16.16b, v6.16b
-; CHECK-SD-NEXT: fcvt s2, h2
-; CHECK-SD-NEXT: mov v4.h[4], v18.h[0]
-; CHECK-SD-NEXT: mov v18.16b, v6.16b
-; CHECK-SD-NEXT: mov v5.h[4], v19.h[0]
-; CHECK-SD-NEXT: fcvt s0, h0
-; CHECK-SD-NEXT: bsl v16.16b, v23.16b, v20.16b
-; CHECK-SD-NEXT: fcvt h17, s17
-; CHECK-SD-NEXT: fcvt s3, h3
-; CHECK-SD-NEXT: bsl v18.16b, v24.16b, v21.16b
-; CHECK-SD-NEXT: fcvt h7, s7
-; CHECK-SD-NEXT: fcvt s1, h1
-; CHECK-SD-NEXT: bif v0.16b, v2.16b, v6.16b
-; CHECK-SD-NEXT: mov v4.h[5], v17.h[0]
-; CHECK-SD-NEXT: fcvt h2, s16
-; CHECK-SD-NEXT: mov v5.h[5], v7.h[0]
-; CHECK-SD-NEXT: fcvt h7, s18
-; CHECK-SD-NEXT: bif v1.16b, v3.16b, v6.16b
-; CHECK-SD-NEXT: fcvt h0, s0
-; CHECK-SD-NEXT: mov v4.h[6], v2.h[0]
-; CHECK-SD-NEXT: mov v5.h[6], v7.h[0]
-; CHECK-SD-NEXT: fcvt h1, s1
-; CHECK-SD-NEXT: mov v4.h[7], v0.h[0]
-; CHECK-SD-NEXT: mov v5.h[7], v1.h[0]
-; CHECK-SD-NEXT: mov v0.16b, v4.16b
-; CHECK-SD-NEXT: mov v1.16b, v5.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: copysign_v16f16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mvni v4.8h, #128, lsl #8
-; CHECK-GI-NEXT: bif v0.16b, v2.16b, v4.16b
-; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: copysign_v16f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mvni v4.8h, #128, lsl #8
+; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-NEXT: ret
entry:
%c = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
ret <16 x half> %c
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index 7805512cbf45e3..d5010cf360841a 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -27,9 +27,11 @@ entry:
define half @fabs_f16(half %a) {
; CHECK-SD-NOFP16-LABEL: fabs_f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fneg s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-SD-NOFP16-NEXT: fmov w8, s0
+; CHECK-SD-NOFP16-NEXT: eor w8, w8, #0x8000
+; CHECK-SD-NOFP16-NEXT: fmov s0, w8
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_f16:
@@ -148,45 +150,8 @@ entry:
define <7 x half> @fabs_v7f16(<7 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v7f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fneg s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fneg s5, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fneg s2, s3
-; CHECK-SD-NOFP16-NEXT: fneg s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fneg s5, s5
-; CHECK-SD-NOFP16-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fneg s3, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fneg s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT: fneg s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT: movi v1.8h, #128, lsl #8
+; CHECK-SD-NOFP16-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v7f16:
@@ -234,26 +199,8 @@ entry:
define <4 x half> @fabs_v4f16(<4 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v4f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fneg s0, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT: fneg s1, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fneg s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fneg s2, s3
-; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT: movi v1.4h, #128, lsl #8
+; CHECK-SD-NOFP16-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v4f16:
@@ -280,45 +227,8 @@ entry:
define <8 x half> @fabs_v8f16(<8 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v8f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fneg s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fneg s5, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT: fneg s2, s3
-; CHECK-SD-NOFP16-NEXT: fneg s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fneg s5, s5
-; CHECK-SD-NOFP16-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fneg s3, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fneg s2, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT: fneg s0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT: mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT: movi v1.8h, #128, lsl #8
+; CHECK-SD-NOFP16-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v8f16:
@@ -348,84 +258,9 @@ entry:
define <16 x half> @fabs_v16f16(<16 x half> %a) {
; CHECK-SD-NOFP16-LABEL: fabs_v16f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT: mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fneg s4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fneg s6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: fneg s18, s2
-; CHECK-SD-NOFP16-NEXT: fneg s19, s3
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT: fneg s4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT: fneg s6, s7
-; CHECK-SD-NOFP16-NEXT: fneg s16, s16
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT: fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fneg s5, s17
-; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT: fneg s18, s18
-; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fneg s4, s19
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT: fneg s7, s7
-; CHECK-SD-NOFP16-NEXT: fneg s16, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fneg s4, s6
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT: fneg s7, s17
-; CHECK-SD-NOFP16-NEXT: fneg s0, s0
-; CHECK-SD-NOFP16-NEXT: fneg s1, s1
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: movi v2.8h, #128, lsl #8
+; CHECK-SD-NOFP16-NEXT: eor v0.16b, v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT: eor v1.16b, v1.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fabs_v16f16:
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
index d01ca881545c02..c33759331bbc8d 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -185,142 +185,42 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
;============ v4f16
define <4 x half> @test_copysign_v4f16_v4f16(<4 x half> %a, <4 x half> %b) #0 {
-; NOFP16-LABEL: test_copysign_v4f16_v4f16:
-; NOFP16: ; %bb.0:
-; NOFP16-NEXT: ; kill: def $d1 killed $d1 def $q1
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT: mov h3, v1[1]
-; NOFP16-NEXT: mov h4, v0[1]
-; NOFP16-NEXT: mov h5, v1[2]
-; NOFP16-NEXT: mov h6, v0[2]
-; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
-; NOFP16-NEXT: fcvt s7, h1
-; NOFP16-NEXT: fcvt s16, h0
-; NOFP16-NEXT: mov h1, v1[3]
-; NOFP16-NEXT: fcvt s3, h3
-; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: bit.16b v3, v4, v2
-; NOFP16-NEXT: fcvt s4, h5
-; NOFP16-NEXT: fcvt s5, h6
-; NOFP16-NEXT: mov.16b v6, v2
-; NOFP16-NEXT: bsl.16b v6, v16, v7
-; NOFP16-NEXT: mov h7, v0[3]
-; NOFP16-NEXT: bit.16b v4, v5, v2
-; NOFP16-NEXT: fcvt h3, s3
-; NOFP16-NEXT: fcvt h0, s6
-; NOFP16-NEXT: fcvt s5, h7
-; NOFP16-NEXT: mov.h v0[1], v3[0]
-; NOFP16-NEXT: fcvt h3, s4
-; NOFP16-NEXT: bit.16b v1, v5, v2
-; NOFP16-NEXT: mov.h v0[2], v3[0]
-; NOFP16-NEXT: fcvt h1, s1
-; NOFP16-NEXT: mov.h v0[3], v1[0]
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; NOFP16-NEXT: ret
-;
-; FP16-LABEL: test_copysign_v4f16_v4f16:
-; FP16: ; %bb.0:
-; FP16-NEXT: mvni.4h v2, #128, lsl #8
-; FP16-NEXT: bif.8b v0, v1, v2
-; FP16-NEXT: ret
+; CHECK-LABEL: test_copysign_v4f16_v4f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: bif.8b v0, v1, v2
+; CHECK-NEXT: ret
%r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %r
}
define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
-; NOFP16-LABEL: test_copysign_v4f16_v4f32:
-; NOFP16: ; %bb.0:
-; NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT: mov h3, v0[1]
-; NOFP16-NEXT: mov h5, v0[2]
-; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
-; NOFP16-NEXT: fcvt s7, h0
-; NOFP16-NEXT: mov h4, v1[1]
-; NOFP16-NEXT: fcvt s3, h3
-; NOFP16-NEXT: mov h6, v1[2]
-; NOFP16-NEXT: fcvt s16, h1
-; NOFP16-NEXT: mov h1, v1[3]
-; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: bif.16b v3, v4, v2
-; NOFP16-NEXT: fcvt s4, h5
-; NOFP16-NEXT: fcvt s5, h6
-; NOFP16-NEXT: mov.16b v6, v2
-; NOFP16-NEXT: bsl.16b v6, v7, v16
-; NOFP16-NEXT: mov h7, v0[3]
-; NOFP16-NEXT: bif.16b v4, v5, v2
-; NOFP16-NEXT: fcvt h3, s3
-; NOFP16-NEXT: fcvt h0, s6
-; NOFP16-NEXT: fcvt s5, h7
-; NOFP16-NEXT: mov.h v0[1], v3[0]
-; NOFP16-NEXT: fcvt h3, s4
-; NOFP16-NEXT: bit.16b v1, v5, v2
-; NOFP16-NEXT: mov.h v0[2], v3[0]
-; NOFP16-NEXT: fcvt h1, s1
-; NOFP16-NEXT: mov.h v0[3], v1[0]
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; NOFP16-NEXT: ret
-;
-; FP16-LABEL: test_copysign_v4f16_v4f32:
-; FP16: ; %bb.0:
-; FP16-NEXT: fcvtn v1.4h, v1.4s
-; FP16-NEXT: mvni.4h v2, #128, lsl #8
-; FP16-NEXT: bif.8b v0, v1, v2
-; FP16-NEXT: ret
+; CHECK-LABEL: test_copysign_v4f16_v4f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: bif.8b v0, v1, v2
+; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x float> %b to <4 x half>
%r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
ret <4 x half> %r
}
define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 {
-; NOFP16-LABEL: test_copysign_v4f16_v4f64:
-; NOFP16: ; %bb.0:
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT: mov d3, v1[1]
-; NOFP16-NEXT: mov h4, v0[1]
-; NOFP16-NEXT: fcvt s1, d1
-; NOFP16-NEXT: fcvt s5, h0
-; NOFP16-NEXT: mov h7, v0[2]
-; NOFP16-NEXT: mvni.4s v6, #128, lsl #24
-; NOFP16-NEXT: fcvt s3, d3
-; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: bit.16b v1, v5, v6
-; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: mov h5, v0[3]
-; NOFP16-NEXT: bit.16b v3, v4, v6
-; NOFP16-NEXT: mov d4, v2[1]
-; NOFP16-NEXT: fcvt s2, d2
-; NOFP16-NEXT: fcvt h0, s1
-; NOFP16-NEXT: fcvt h1, s3
-; NOFP16-NEXT: bit.16b v2, v7, v6
-; NOFP16-NEXT: fcvt s3, d4
-; NOFP16-NEXT: fcvt s4, h5
-; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: fcvt h1, s2
-; NOFP16-NEXT: mov.16b v2, v6
-; NOFP16-NEXT: bsl.16b v2, v4, v3
-; NOFP16-NEXT: mov.h v0[2], v1[0]
-; NOFP16-NEXT: fcvt h1, s2
-; NOFP16-NEXT: mov.h v0[3], v1[0]
-; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; NOFP16-NEXT: ret
-;
-; FP16-LABEL: test_copysign_v4f16_v4f64:
-; FP16: ; %bb.0:
-; FP16-NEXT: mov d3, v1[1]
-; FP16-NEXT: fcvt h1, d1
-; FP16-NEXT: fcvt h3, d3
-; FP16-NEXT: mov.h v1[1], v3[0]
-; FP16-NEXT: fcvt h3, d2
-; FP16-NEXT: mov d2, v2[1]
-; FP16-NEXT: mov.h v1[2], v3[0]
-; FP16-NEXT: fcvt h2, d2
-; FP16-NEXT: mov.h v1[3], v2[0]
-; FP16-NEXT: mvni.4h v2, #128, lsl #8
-; FP16-NEXT: bif.8b v0, v1, v2
-; FP16-NEXT: ret
+; CHECK-LABEL: test_copysign_v4f16_v4f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov d3, v1[1]
+; CHECK-NEXT: fcvt h1, d1
+; CHECK-NEXT: fcvt h3, d3
+; CHECK-NEXT: mov.h v1[1], v3[0]
+; CHECK-NEXT: fcvt h3, d2
+; CHECK-NEXT: mov d2, v2[1]
+; CHECK-NEXT: mov.h v1[2], v3[0]
+; CHECK-NEXT: fcvt h2, d2
+; CHECK-NEXT: mov.h v1[3], v2[0]
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: bif.8b v0, v1, v2
+; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x half>
%r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
ret <4 x half> %r
@@ -331,145 +231,23 @@ declare <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) #0
;============ v8f16
define <8 x half> @test_copysign_v8f16_v8f16(<8 x half> %a, <8 x half> %b) #0 {
-; NOFP16-LABEL: test_copysign_v8f16_v8f16:
-; NOFP16: ; %bb.0:
-; NOFP16-NEXT: mov h2, v1[1]
-; NOFP16-NEXT: mov h4, v0[1]
-; NOFP16-NEXT: fcvt s5, h1
-; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
-; NOFP16-NEXT: mov h7, v1[2]
-; NOFP16-NEXT: mov h16, v0[2]
-; NOFP16-NEXT: mov h17, v1[3]
-; NOFP16-NEXT: fcvt s2, h2
-; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: bit.16b v5, v6, v3
-; NOFP16-NEXT: mov h6, v0[3]
-; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: bif.16b v4, v2, v3
-; NOFP16-NEXT: fcvt h2, s5
-; NOFP16-NEXT: mov.16b v5, v3
-; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: bsl.16b v5, v16, v7
-; NOFP16-NEXT: fcvt h4, s4
-; NOFP16-NEXT: mov h7, v1[4]
-; NOFP16-NEXT: mov h16, v0[4]
-; NOFP16-NEXT: bif.16b v6, v17, v3
-; NOFP16-NEXT: mov h17, v0[5]
-; NOFP16-NEXT: fcvt h5, s5
-; NOFP16-NEXT: mov.h v2[1], v4[0]
-; NOFP16-NEXT: fcvt s4, h7
-; NOFP16-NEXT: fcvt s7, h16
-; NOFP16-NEXT: mov h16, v1[5]
-; NOFP16-NEXT: fcvt h6, s6
-; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: mov.h v2[2], v5[0]
-; NOFP16-NEXT: mov h5, v1[6]
-; NOFP16-NEXT: mov h1, v1[7]
-; NOFP16-NEXT: bit.16b v4, v7, v3
-; NOFP16-NEXT: mov h7, v0[6]
-; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: mov h0, v0[7]
-; NOFP16-NEXT: mov.h v2[3], v6[0]
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: fcvt s6, h7
-; NOFP16-NEXT: mov.16b v7, v3
-; NOFP16-NEXT: fcvt h4, s4
-; NOFP16-NEXT: fcvt s0, h0
-; NOFP16-NEXT: bsl.16b v7, v17, v16
-; NOFP16-NEXT: bit.16b v5, v6, v3
-; NOFP16-NEXT: mov.h v2[4], v4[0]
-; NOFP16-NEXT: bif.16b v0, v1, v3
-; NOFP16-NEXT: fcvt h4, s7
-; NOFP16-NEXT: fcvt h0, s0
-; NOFP16-NEXT: mov.h v2[5], v4[0]
-; NOFP16-NEXT: fcvt h4, s5
-; NOFP16-NEXT: mov.h v2[6], v4[0]
-; NOFP16-NEXT: mov.h v2[7], v0[0]
-; NOFP16-NEXT: mov.16b v0, v2
-; NOFP16-NEXT: ret
-;
-; FP16-LABEL: test_copysign_v8f16_v8f16:
-; FP16: ; %bb.0:
-; FP16-NEXT: mvni.8h v2, #128, lsl #8
-; FP16-NEXT: bif.16b v0, v1, v2
-; FP16-NEXT: ret
+; CHECK-LABEL: test_copysign_v8f16_v8f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mvni.8h v2, #128, lsl #8
+; CHECK-NEXT: bif.16b v0, v1, v2
+; CHECK-NEXT: ret
%r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
ret <8 x half> %r
}
define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
-; NOFP16-LABEL: test_copysign_v8f16_v8f32:
-; NOFP16: ; %bb.0:
-; NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; NOFP16-NEXT: mov h4, v0[1]
-; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
-; NOFP16-NEXT: mov h7, v0[2]
-; NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; NOFP16-NEXT: mov h5, v1[1]
-; NOFP16-NEXT: fcvt s16, h1
-; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: mov h17, v1[2]
-; NOFP16-NEXT: mov h1, v1[3]
-; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: bif.16b v6, v16, v3
-; NOFP16-NEXT: mov h16, v0[3]
-; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: fcvt s18, h1
-; NOFP16-NEXT: bif.16b v4, v5, v3
-; NOFP16-NEXT: fcvt h1, s6
-; NOFP16-NEXT: mov.16b v6, v3
-; NOFP16-NEXT: mov h5, v0[4]
-; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: bsl.16b v6, v7, v17
-; NOFP16-NEXT: mov h7, v0[5]
-; NOFP16-NEXT: mov h17, v2[1]
-; NOFP16-NEXT: fcvt h4, s4
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: bif.16b v16, v18, v3
-; NOFP16-NEXT: fcvt h6, s6
-; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: mov.h v1[1], v4[0]
-; NOFP16-NEXT: fcvt s4, h2
-; NOFP16-NEXT: bif.16b v7, v17, v3
-; NOFP16-NEXT: bit.16b v4, v5, v3
-; NOFP16-NEXT: fcvt h5, s16
-; NOFP16-NEXT: mov.h v1[2], v6[0]
-; NOFP16-NEXT: mov h6, v0[6]
-; NOFP16-NEXT: mov h16, v2[2]
-; NOFP16-NEXT: mov h0, v0[7]
-; NOFP16-NEXT: mov h2, v2[3]
-; NOFP16-NEXT: mov.h v1[3], v5[0]
-; NOFP16-NEXT: fcvt h4, s4
-; NOFP16-NEXT: fcvt s5, h6
-; NOFP16-NEXT: fcvt s6, h16
-; NOFP16-NEXT: fcvt s0, h0
-; NOFP16-NEXT: fcvt s2, h2
-; NOFP16-NEXT: mov.h v1[4], v4[0]
-; NOFP16-NEXT: fcvt h4, s7
-; NOFP16-NEXT: bif.16b v5, v6, v3
-; NOFP16-NEXT: bif.16b v0, v2, v3
-; NOFP16-NEXT: mov.h v1[5], v4[0]
-; NOFP16-NEXT: fcvt h4, s5
-; NOFP16-NEXT: fcvt h0, s0
-; NOFP16-NEXT: mov.h v1[6], v4[0]
-; NOFP16-NEXT: mov.h v1[7], v0[0]
-; NOFP16-NEXT: mov.16b v0, v1
-; NOFP16-NEXT: ret
-;
-; FP16-LABEL: test_copysign_v8f16_v8f32:
-; FP16: ; %bb.0:
-; FP16-NEXT: fcvtn v1.4h, v1.4s
-; FP16-NEXT: fcvtn2 v1.8h, v2.4s
-; FP16-NEXT: mvni.8h v2, #128, lsl #8
-; FP16-NEXT: bif.16b v0, v1, v2
-; FP16-NEXT: ret
+; CHECK-LABEL: test_copysign_v8f16_v8f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-NEXT: fcvtn2 v1.8h, v2.4s
+; CHECK-NEXT: mvni.8h v2, #128, lsl #8
+; CHECK-NEXT: bif.16b v0, v1, v2
+; CHECK-NEXT: ret
%tmp0 = fptrunc <8 x float> %b to <8 x half>
%r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
ret <8 x half> %r
@@ -482,60 +260,8 @@ declare <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) #0
define <4 x bfloat> @test_copysign_v4bf16_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
; CHECK-LABEL: test_copysign_v4bf16_v4bf16:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h3, v1[1]
-; CHECK-NEXT: mov h4, v0[1]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov h5, v1[2]
-; CHECK-NEXT: mov h6, v0[2]
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: mvni.4s v2, #128, lsl #24
-; CHECK-NEXT: mov h1, v1[3]
-; CHECK-NEXT: mov h0, v0[3]
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: lsl w11, w11, #16
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s7, w8
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov s3, w9
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fmov w9, s6
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: bit.16b v3, v4, v2
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: fmov s4, w11
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: bif.16b v4, v7, v2
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: lsl w11, w11, #16
-; CHECK-NEXT: bif.16b v1, v0, v2
-; CHECK-NEXT: fmov s5, w11
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.16b v1, v2
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: bsl.16b v1, v5, v4
-; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov.h v0[1], v3[0]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v0[2], v2[0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov.h v0[3], v1[0]
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %r
@@ -546,66 +272,16 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b)
; CHECK: ; %bb.0:
; CHECK-NEXT: movi.4s v2, #127, msl #8
; CHECK-NEXT: movi.4s v3, #1
-; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ushr.4s v4, v1, #16
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov h5, v0[2]
-; CHECK-NEXT: mov h6, v0[3]
; CHECK-NEXT: add.4s v2, v1, v2
; CHECK-NEXT: and.16b v3, v4, v3
-; CHECK-NEXT: fcmeq.4s v4, v1, v1
-; CHECK-NEXT: orr.4s v1, #64, lsl #16
-; CHECK-NEXT: lsl w9, w9, #16
; CHECK-NEXT: add.4s v2, v3, v2
-; CHECK-NEXT: mov h3, v0[1]
-; CHECK-NEXT: bit.16b v1, v2, v4
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: shrn.4h v2, v1, #16
-; CHECK-NEXT: mvni.4s v1, #128, lsl #24
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: fmov s5, w9
-; CHECK-NEXT: mov h4, v2[1]
-; CHECK-NEXT: mov h0, v2[2]
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: mov h2, v2[3]
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: lsl w11, w11, #16
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fmov w10, s6
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: bif.16b v3, v4, v1
-; CHECK-NEXT: fmov s4, w11
-; CHECK-NEXT: bit.16b v2, v0, v1
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: bit.16b v4, v5, v1
-; CHECK-NEXT: fmov s5, w8
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w11, s4
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s3, w9
-; CHECK-NEXT: lsr w11, w11, #16
-; CHECK-NEXT: bsl.16b v1, v4, v5
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmov s0, w11
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[1], v3[0]
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v0[2], v2[0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov.h v0[3], v1[0]
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: fcmeq.4s v3, v1, v1
+; CHECK-NEXT: orr.4s v1, #64, lsl #16
+; CHECK-NEXT: bit.16b v1, v2, v3
+; CHECK-NEXT: mvni.4h v2, #128, lsl #8
+; CHECK-NEXT: shrn.4h v1, v1, #16
+; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x float> %b to <4 x bfloat>
%r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %tmp0)
@@ -615,51 +291,42 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b)
define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4bf16_v4f64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov h4, v0[1]
-; CHECK-NEXT: mov h5, v0[2]
; CHECK-NEXT: mov d3, v1[1]
-; CHECK-NEXT: fcvt s1, d1
-; CHECK-NEXT: mov h0, v0[3]
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mvni.4s v4, #128, lsl #24
-; CHECK-NEXT: fmov s6, w8
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: fcvt s3, d3
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: bit.16b v1, v6, v4
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: mov d6, v2[1]
-; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: fcvt s2, d2
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s5, w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.16b v1, v4
-; CHECK-NEXT: bit.16b v3, v7, v4
-; CHECK-NEXT: bsl.16b v1, v5, v2
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fcvt s2, d6
+; CHECK-NEXT: fcvtxn s1, d1
+; CHECK-NEXT: mov w8, #32767 ; =0x7fff
+; CHECK-NEXT: fcvtxn s3, d3
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: ubfx w12, w10, #16, #1
+; CHECK-NEXT: add w10, w10, w8
; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: fmov s5, w10
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.16b v1, v4
+; CHECK-NEXT: fcvtxn s3, d2
+; CHECK-NEXT: mov d2, v2[1]
+; CHECK-NEXT: add w10, w12, w10
+; CHECK-NEXT: lsr w10, w10, #16
+; CHECK-NEXT: ubfx w11, w9, #16, #1
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: fcvtxn s1, d2
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: fmov w11, s3
+; CHECK-NEXT: fmov s3, w10
; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: fmov s3, w9
-; CHECK-NEXT: bsl.16b v1, v5, v2
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v0[1], v3[0]
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov.h v0[2], v2[0]
+; CHECK-NEXT: ubfx w12, w11, #16, #1
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: add w9, w11, w8
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: add w9, w12, w9
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: mov.h v3[1], v2[0]
+; CHECK-NEXT: ubfx w11, w10, #16, #1
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: add w8, w11, w8
; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: mov.h v3[2], v1[0]
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov.h v0[3], v1[0]
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov.h v3[3], v1[0]
+; CHECK-NEXT: mvni.4h v1, #128, lsl #8
+; CHECK-NEXT: bif.8b v0, v3, v1
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x bfloat>
%r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %tmp0)
@@ -673,111 +340,8 @@ declare <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) #0
define <8 x bfloat> @test_copysign_v8bf16_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_copysign_v8bf16_v8bf16:
; CHECK: ; %bb.0:
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov h2, v1[1]
-; CHECK-NEXT: mov h4, v0[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov h6, v1[2]
-; CHECK-NEXT: mov h7, v0[2]
-; CHECK-NEXT: mvni.4s v3, #128, lsl #24
-; CHECK-NEXT: mov h5, v1[3]
-; CHECK-NEXT: mov h16, v0[3]
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: mov h17, v1[4]
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: mov h4, v0[4]
-; CHECK-NEXT: fmov s18, w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w11, s7
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: lsl w9, w10, #16
-; CHECK-NEXT: fmov w10, s6
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: bif.16b v2, v18, v3
-; CHECK-NEXT: lsl w9, w11, #16
-; CHECK-NEXT: fmov s6, w8
-; CHECK-NEXT: lsl w8, w10, #16
-; CHECK-NEXT: fmov w10, s5
-; CHECK-NEXT: fmov w11, s16
-; CHECK-NEXT: fmov s16, w9
-; CHECK-NEXT: mov h18, v0[5]
-; CHECK-NEXT: fmov s5, w8
-; CHECK-NEXT: bit.16b v6, v7, v3
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: lsl w9, w10, #16
-; CHECK-NEXT: lsl w10, w11, #16
-; CHECK-NEXT: mov h7, v1[5]
-; CHECK-NEXT: bit.16b v5, v16, v3
-; CHECK-NEXT: fmov s16, w10
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: mov.16b v4, v3
-; CHECK-NEXT: fmov w11, s6
-; CHECK-NEXT: fmov s6, w9
-; CHECK-NEXT: fmov w9, s17
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsr w11, w11, #16
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: lsl w8, w9, #16
-; CHECK-NEXT: bsl.16b v4, v16, v6
-; CHECK-NEXT: lsl w9, w10, #16
-; CHECK-NEXT: fmov w10, s5
-; CHECK-NEXT: fmov s6, w11
-; CHECK-NEXT: fmov s5, w8
-; CHECK-NEXT: lsr w8, w10, #16
-; CHECK-NEXT: fmov w10, s7
-; CHECK-NEXT: mov.h v2[1], v6[0]
-; CHECK-NEXT: fmov s6, w9
-; CHECK-NEXT: fmov w9, s18
-; CHECK-NEXT: fmov s7, w8
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: mov h4, v1[6]
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: mov h1, v1[7]
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: bit.16b v5, v6, v3
-; CHECK-NEXT: mov h6, v0[6]
-; CHECK-NEXT: mov.h v2[2], v7[0]
-; CHECK-NEXT: fmov s7, w10
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s16, w9
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mov h0, v0[7]
-; CHECK-NEXT: fmov w10, s6
-; CHECK-NEXT: bit.16b v7, v16, v3
-; CHECK-NEXT: fmov s16, w8
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v2[3], v16[0]
-; CHECK-NEXT: fmov s5, w9
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: mov.h v2[4], v4[0]
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: bif.16b v4, v5, v3
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s5, w10
-; CHECK-NEXT: mov.h v2[5], v0[0]
-; CHECK-NEXT: mov.16b v0, v3
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: bsl.16b v0, v5, v1
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.h v2[6], v1[0]
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov.h v2[7], v0[0]
-; CHECK-NEXT: mov.16b v0, v2
+; CHECK-NEXT: mvni.8h v2, #128, lsl #8
+; CHECK-NEXT: bif.16b v0, v1, v2
; CHECK-NEXT: ret
%r = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %r
@@ -788,126 +352,23 @@ define <8 x bfloat> @test_copysign_v8bf16_v8f32(<8 x bfloat> %a, <8 x float> %b)
; CHECK: ; %bb.0:
; CHECK-NEXT: movi.4s v3, #127, msl #8
; CHECK-NEXT: movi.4s v4, #1
-; CHECK-NEXT: ushr.4s v5, v1, #16
-; CHECK-NEXT: fcmeq.4s v7, v1, v1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: add.4s v6, v1, v3
+; CHECK-NEXT: ushr.4s v5, v2, #16
+; CHECK-NEXT: ushr.4s v6, v1, #16
+; CHECK-NEXT: add.4s v7, v2, v3
+; CHECK-NEXT: add.4s v3, v1, v3
; CHECK-NEXT: and.16b v5, v5, v4
-; CHECK-NEXT: orr.4s v1, #64, lsl #16
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: add.4s v5, v5, v6
-; CHECK-NEXT: ushr.4s v6, v2, #16
; CHECK-NEXT: and.16b v4, v6, v4
-; CHECK-NEXT: mov h6, v0[2]
-; CHECK-NEXT: bit.16b v1, v5, v7
-; CHECK-NEXT: add.4s v7, v2, v3
-; CHECK-NEXT: mov h5, v0[1]
-; CHECK-NEXT: fcmeq.4s v3, v2, v2
+; CHECK-NEXT: fcmeq.4s v6, v2, v2
; CHECK-NEXT: orr.4s v2, #64, lsl #16
-; CHECK-NEXT: shrn.4h v1, v1, #16
-; CHECK-NEXT: add.4s v4, v4, v7
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov h7, v0[3]
-; CHECK-NEXT: mov h5, v0[4]
-; CHECK-NEXT: mov h16, v1[1]
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: bsl.16b v3, v4, v2
-; CHECK-NEXT: mov h4, v1[2]
-; CHECK-NEXT: mov h17, v1[3]
-; CHECK-NEXT: mvni.4s v2, #128, lsl #24
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fmov w9, s6
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s6, w8
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: fmov w11, s16
-; CHECK-NEXT: fmov s7, w10
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: mov.16b v4, v2
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: shrn.4h v3, v3, #16
-; CHECK-NEXT: lsl w11, w11, #16
-; CHECK-NEXT: bif.16b v1, v7, v2
-; CHECK-NEXT: fmov s16, w8
-; CHECK-NEXT: fmov s7, w11
-; CHECK-NEXT: bsl.16b v4, v6, v7
-; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: lsl w9, w10, #16
-; CHECK-NEXT: fmov w10, s17
-; CHECK-NEXT: mov h6, v0[5]
-; CHECK-NEXT: lsl w8, w10, #16
-; CHECK-NEXT: fmov w10, s1
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: lsr w9, w10, #16
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: bif.16b v7, v1, v2
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: mov h5, v3[1]
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: lsr w10, w10, #16
-; CHECK-NEXT: bit.16b v4, v16, v2
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: fmov s16, w10
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: fmov w10, s7
-; CHECK-NEXT: mov h7, v0[6]
-; CHECK-NEXT: mov h0, v0[7]
-; CHECK-NEXT: mov.h v1[1], v16[0]
-; CHECK-NEXT: fmov s16, w8
-; CHECK-NEXT: fmov w8, s6
-; CHECK-NEXT: fmov s6, w9
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: lsr w10, w10, #16
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: bit.16b v6, v16, v2
-; CHECK-NEXT: fmov s16, w10
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s5, w8
-; CHECK-NEXT: fmov w8, s7
-; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: mov h4, v3[2]
-; CHECK-NEXT: mov h3, v3[3]
-; CHECK-NEXT: mov.h v1[2], v16[0]
-; CHECK-NEXT: lsr w10, w10, #16
-; CHECK-NEXT: fmov w9, s6
-; CHECK-NEXT: lsl w8, w8, #16
-; CHECK-NEXT: bif.16b v5, v7, v2
-; CHECK-NEXT: fmov s16, w10
-; CHECK-NEXT: fmov w10, s4
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: mov.h v1[3], v16[0]
-; CHECK-NEXT: fmov w8, s5
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s6, w9
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov s5, w10
-; CHECK-NEXT: fmov w10, s3
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mov.h v1[4], v6[0]
-; CHECK-NEXT: lsl w9, w9, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: bif.16b v4, v5, v2
-; CHECK-NEXT: lsl w10, w10, #16
-; CHECK-NEXT: fmov s3, w9
-; CHECK-NEXT: fmov s5, w10
-; CHECK-NEXT: mov.h v1[5], v0[0]
-; CHECK-NEXT: mov.16b v0, v2
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: bsl.16b v0, v3, v5
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov.h v1[6], v2[0]
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov.h v1[7], v0[0]
-; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: add.4s v5, v5, v7
+; CHECK-NEXT: add.4s v3, v4, v3
+; CHECK-NEXT: fcmeq.4s v4, v1, v1
+; CHECK-NEXT: orr.4s v1, #64, lsl #16
+; CHECK-NEXT: bit.16b v2, v5, v6
+; CHECK-NEXT: bit.16b v1, v3, v4
+; CHECK-NEXT: uzp2.8h v1, v1, v2
+; CHECK-NEXT: mvni.8h v2, #128, lsl #8
+; CHECK-NEXT: bif.16b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <8 x float> %b to <8 x bfloat>
%r = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %tmp0)
@@ -917,3 +378,6 @@ define <8 x bfloat> @test_copysign_v8bf16_v8f32(<8 x bfloat> %a, <8 x float> %b)
declare <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) #0
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FP16: {{.*}}
+; NOFP16: {{.*}}
More information about the llvm-commits mailing list