[llvm] r283119 - [x86, SSE/AVX] allow 128/256-bit lowering for copysign vector intrinsics (PR30433)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 3 09:38:27 PDT 2016
Author: spatel
Date: Mon Oct 3 11:38:27 2016
New Revision: 283119
URL: http://llvm.org/viewvc/llvm-project?rev=283119&view=rev
Log:
[x86, SSE/AVX] allow 128/256-bit lowering for copysign vector intrinsics (PR30433)
This should fix:
https://llvm.org/bugs/show_bug.cgi?id=30433
There are a couple of open questions about the codegen:
1. Should we let scalar ops be scalars and avoid vector constant loads/splats?
2. Should we have a pass to combine constants such as the inverted pair that we have here?
Differential Revision: https://reviews.llvm.org/D25165
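To make the open questions concrete, here is a minimal scalar sketch (not
part of the patch) of the bitmask lowering that this commit now applies to
whole vectors. The two masks are bitwise complements of each other, which
is the "inverted pair" from question 2:

  #include <cstdint>
  #include <cstring>

  // copysign(mag, sgn) via integer masks -- one lane of what the vector
  // lowering does with andps/andps/orps. Names here are illustrative.
  float copysign_bits(float mag, float sgn) {
    uint32_t m, s;
    std::memcpy(&m, &mag, sizeof m);
    std::memcpy(&s, &sgn, sizeof s);
    const uint32_t SignMask = 0x80000000u; // sign bit only
    const uint32_t MagMask  = ~SignMask;   // 0x7fffffff, the inverted constant
    const uint32_t r = (m & MagMask) | (s & SignMask);
    float out;
    std::memcpy(&out, &r, sizeof out);
    return out;
  }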
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
llvm/trunk/test/CodeGen/X86/vec-copysign.ll
llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=283119&r1=283118&r2=283119&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Oct 3 11:38:27 2016
@@ -730,6 +730,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -765,6 +766,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
@@ -980,6 +982,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
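For context: marking FCOPYSIGN as Custom routes the node through the
target's LowerOperation hook, which dispatches to the LowerFCOPYSIGN
routine modified below. A simplified sketch of that dispatch (elided to
the relevant case):

  SDValue X86TargetLowering::LowerOperation(SDValue Op,
                                            SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    // ...
    case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
    // ...
    default: llvm_unreachable("Should not custom lower this!");
    }
  }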
@@ -14662,31 +14665,39 @@ static SDValue LowerFCOPYSIGN(SDValue Op
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
- assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+ assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+ VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32) &&
"Unexpected type in LowerFCOPYSIGN");
+ MVT EltVT = VT.getScalarType();
const fltSemantics &Sem =
- VT == MVT::f64 ? APFloat::IEEEdouble :
- (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
- const unsigned SizeInBits = VT.getSizeInBits();
+ EltVT == MVT::f64 ? APFloat::IEEEdouble
+ : (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
- // Perform all logic operations as 16-byte vectors because there are no
+ // Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
- MVT LogicVT =
- (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
+ // TODO: This isn't necessary. If we used scalar types, we might avoid some
+ // unnecessary splats, but we might miss load folding opportunities. Should
+ // this decision be based on OptimizeForSize?
+ bool IsFakeVector = !VT.isVector() && !IsF128;
+ MVT LogicVT = VT;
+ if (IsFakeVector)
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+ // The mask constants are automatically splatted for vector types.
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
- APFloat(Sem, APInt::getSignBit(SizeInBits)), dl, LogicVT);
+ APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ SDValue MagMask = DAG.getConstantFP(
+ APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
- if (!IsF128)
+ if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
- // If it's a constant, we can clear it here.
- SDValue MagMask = DAG.getConstantFP(
- APFloat(Sem, ~APInt::getSignBit(SizeInBits)), dl, LogicVT);
-
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
@@ -14696,16 +14707,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
- if (!IsF128)
+ if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
- return IsF128 ? Or :
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
- DAG.getIntPtrConstant(0, dl));
+ return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+ DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
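As a hedged illustration of the mask construction above: APInt::getSignBit
(the API this revision uses) produces a value with only the top bit set,
and its complement is the magnitude mask; getConstantFP then splats these
across the lanes of a vector LogicVT.

  #include "llvm/ADT/APInt.h"
  using llvm::APInt;

  // For a 32-bit element: 0x80000000 and 0x7fffffff.
  void buildMasks() {
    APInt SignBits = APInt::getSignBit(32);  // sign-bit-only mask
    APInt MagBits  = ~APInt::getSignBit(32); // all-but-sign mask
    (void)SignBits;
    (void)MagBits;
  }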
Modified: llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll?rev=283119&r1=283118&r2=283119&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll Mon Oct 3 11:38:27 2016
@@ -401,22 +401,22 @@ define i32 @fcopysign(i32 %arg) {
; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
%F32 = call float @llvm.copysign.f32(float undef, float undef)
- ; SSE2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
- ; SSE42: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
- ; AVX: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
- ; AVX2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
- ; AVX512: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+ ; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+ ; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+ ; AVX: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+ ; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+ ; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
%V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
- ; SSE2: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
- ; SSE42: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
- ; AVX: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
- ; AVX2: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
- ; AVX512: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+ ; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+ ; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+ ; AVX: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+ ; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+ ; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
%V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
- ; SSE2: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
- ; SSE42: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
- ; AVX: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
- ; AVX2: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+ ; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+ ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+ ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+ ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
; AVX512: cost of 77 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
%V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
@@ -426,22 +426,22 @@ define i32 @fcopysign(i32 %arg) {
; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
%F64 = call double @llvm.copysign.f64(double undef, double undef)
- ; SSE2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
- ; SSE42: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
- ; AVX: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
- ; AVX2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
- ; AVX512: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+ ; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+ ; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+ ; AVX: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+ ; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+ ; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
%V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
- ; SSE2: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
- ; SSE42: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
- ; AVX: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
- ; AVX2: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
- ; AVX512: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+ ; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+ ; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+ ; AVX: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+ ; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+ ; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
%V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
- ; SSE2: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
- ; SSE42: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
- ; AVX: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
- ; AVX2: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+ ; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+ ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+ ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+ ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
; AVX512: cost of 37 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
%V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
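The new numbers follow a simple pattern for the 128/256-bit types this
patch covers: a copysign on a type that fits one legal vector register
costs 2, and wider types are split, scaling the cost by the number of
pieces (the 512-bit AVX512 rows are untouched). A rough model of the
table -- my reading, not the actual TTI code:

  // legalBits is 128 for SSE, 256 for AVX/AVX2.
  // e.g. v16f32 on SSE: 512/128 = 4 pieces -> cost 8; on AVX: 2 -> cost 4.
  unsigned copysignCost(unsigned vecBits, unsigned legalBits) {
    unsigned pieces = (vecBits + legalBits - 1) / legalBits;
    return 2 * pieces;
  }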
Modified: llvm/trunk/test/CodeGen/X86/vec-copysign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec-copysign.ll?rev=283119&r1=283118&r2=283119&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec-copysign.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec-copysign.ll Mon Oct 3 11:38:27 2016
@@ -1,286 +1,161 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=CHECK
-; FIXME: These don't have to be scalarized.
+; Assertions have been enhanced from the output of utils/update_test_checks.py to show the constant pool values.
+; Use a macosx triple to make sure the format of those constant strings is exact.
+
+; CHECK: [[SIGNMASK1:L.+]]:
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 2147483648
+
+; CHECK: [[MAGMASK1:L.+]]:
+; CHECK-NEXT: .long 2147483647
+; CHECK-NEXT: .long 2147483647
+; CHECK-NEXT: .long 2147483647
+; CHECK-NEXT: .long 2147483647
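The decimal constant-pool values above (and the .quad values further down)
are just the sign and magnitude masks printed in base 10; an illustrative
check:

  #include <cassert>
  #include <cstdint>

  int main() {
    assert(2147483648u == 0x80000000u); // f32 sign mask
    assert(2147483647u == 0x7fffffffu); // f32 magnitude mask
    assert(uint64_t(INT64_MIN) == 0x8000000000000000ull); // f64 sign mask
    assert(9223372036854775807ull == 0x7fffffffffffffffull); // f64 magnitude
    return 0;
  }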
define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
; SSE2-LABEL: v4f32:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
-; SSE2-NEXT: andps %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm5
-; SSE2-NEXT: andps %xmm5, %xmm4
-; SSE2-NEXT: orps %xmm2, %xmm4
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; SSE2-NEXT: andps %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm0, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1,2,3]
-; SSE2-NEXT: andps %xmm5, %xmm6
-; SSE2-NEXT: orps %xmm2, %xmm6
-; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: andps %xmm5, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: andps %xmm3, %xmm1
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: andps [[SIGNMASK1]](%rip), %xmm1
+; SSE2-NEXT: andps [[MAGMASK1]](%rip), %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: v4f32:
; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
-; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vmovaps {{.*#+}} xmm4
-; AVX-NEXT: vandps %xmm4, %xmm0, %xmm5
-; AVX-NEXT: vorps %xmm3, %xmm5, %xmm3
-; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX-NEXT: vandps %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX-NEXT: vandps %xmm4, %xmm6, %xmm6
-; AVX-NEXT: vorps %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT: vandpd %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm4, %xmm6, %xmm6
-; AVX-NEXT: vorpd %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vandps %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vandps [[SIGNMASK1]](%rip), %xmm1, %xmm1
+; AVX-NEXT: vandps [[MAGMASK1]](%rip), %xmm0, %xmm0
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
; AVX-NEXT: retq
;
%tmp = tail call <4 x float> @llvm.copysign.v4f32( <4 x float> %a, <4 x float> %b )
ret <4 x float> %tmp
}
+; SSE2: [[SIGNMASK2:L.+]]:
+; SSE2-NEXT: .long 2147483648
+; SSE2-NEXT: .long 2147483648
+; SSE2-NEXT: .long 2147483648
+; SSE2-NEXT: .long 2147483648
+
+; SSE2: [[MAGMASK2:L.+]]:
+; SSE2-NEXT: .long 2147483647
+; SSE2-NEXT: .long 2147483647
+; SSE2-NEXT: .long 2147483647
+; SSE2-NEXT: .long 2147483647
+
+; AVX: [[SIGNMASK2:L.+]]:
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+; AVX-NEXT: .long 2147483648
+
+; AVX: [[MAGMASK2:L.+]]:
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+; AVX-NEXT: .long 2147483647
+
define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
; SSE2-LABEL: v8f32:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm8 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
-; SSE2-NEXT: andps %xmm8, %xmm0
-; SSE2-NEXT: movaps %xmm5, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm6
-; SSE2-NEXT: andps %xmm6, %xmm7
-; SSE2-NEXT: orps %xmm0, %xmm7
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: andps %xmm8, %xmm0
-; SSE2-NEXT: movaps %xmm5, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; SSE2-NEXT: andps %xmm6, %xmm4
-; SSE2-NEXT: orps %xmm0, %xmm4
-; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; SSE2-NEXT: movaps %xmm2, %xmm7
-; SSE2-NEXT: andps %xmm8, %xmm7
-; SSE2-NEXT: movaps %xmm5, %xmm0
-; SSE2-NEXT: andps %xmm6, %xmm0
-; SSE2-NEXT: orps %xmm7, %xmm0
-; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT: andps %xmm8, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
-; SSE2-NEXT: andps %xmm6, %xmm5
-; SSE2-NEXT: orps %xmm2, %xmm5
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; SSE2-NEXT: andps %xmm8, %xmm2
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; SSE2-NEXT: andps %xmm6, %xmm4
-; SSE2-NEXT: orps %xmm2, %xmm4
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; SSE2-NEXT: andps %xmm8, %xmm2
-; SSE2-NEXT: movaps %xmm1, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; SSE2-NEXT: andps %xmm6, %xmm5
-; SSE2-NEXT: orps %xmm2, %xmm5
-; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE2-NEXT: movaps %xmm3, %xmm4
-; SSE2-NEXT: andps %xmm8, %xmm4
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: andps %xmm6, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; SSE2-NEXT: andps %xmm8, %xmm3
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: andps %xmm6, %xmm1
+; SSE2-NEXT: movaps [[SIGNMASK2]](%rip), %xmm4
+; SSE2-NEXT: andps %xmm4, %xmm2
+; SSE2-NEXT: movaps [[MAGMASK2]](%rip), %xmm5
+; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm3
+; SSE2-NEXT: andps %xmm5, %xmm1
; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: retq
;
; AVX-LABEL: v8f32:
; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
-; AVX-NEXT: vandps %xmm2, %xmm4, %xmm5
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX-NEXT: vmovaps {{.*#+}} xmm3
-; AVX-NEXT: vandps %xmm3, %xmm6, %xmm7
-; AVX-NEXT: vorps %xmm5, %xmm7, %xmm8
-; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; AVX-NEXT: vandps %xmm2, %xmm7, %xmm7
-; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; AVX-NEXT: vandps %xmm3, %xmm5, %xmm5
-; AVX-NEXT: vorps %xmm7, %xmm5, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX-NEXT: vandpd %xmm2, %xmm7, %xmm7
-; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX-NEXT: vandpd %xmm3, %xmm5, %xmm5
-; AVX-NEXT: vorpd %xmm7, %xmm5, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm8[0,1],xmm5[0],xmm8[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX-NEXT: vandps %xmm2, %xmm4, %xmm4
-; AVX-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX-NEXT: vandps %xmm3, %xmm6, %xmm6
-; AVX-NEXT: vorps %xmm4, %xmm6, %xmm4
-; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX-NEXT: vandps %xmm2, %xmm1, %xmm5
-; AVX-NEXT: vandps %xmm3, %xmm0, %xmm6
-; AVX-NEXT: vorps %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; AVX-NEXT: vandps %xmm2, %xmm6, %xmm6
-; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; AVX-NEXT: vandps %xmm3, %xmm7, %xmm7
-; AVX-NEXT: vorps %xmm6, %xmm7, %xmm6
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX-NEXT: vandpd %xmm2, %xmm6, %xmm6
-; AVX-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm3, %xmm7, %xmm7
-; AVX-NEXT: vorpd %xmm6, %xmm7, %xmm6
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT: vandps [[SIGNMASK2]](%rip), %ymm1, %ymm1
+; AVX-NEXT: vandps [[MAGMASK2]](%rip), %ymm0, %ymm0
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
%tmp = tail call <8 x float> @llvm.copysign.v8f32( <8 x float> %a, <8 x float> %b )
ret <8 x float> %tmp
}
+; CHECK: [[SIGNMASK3:L.+]]:
+; CHECK-NEXT: .quad -9223372036854775808
+; CHECK-NEXT: .quad -9223372036854775808
+
+; CHECK: [[MAGMASK3:L.+]]:
+; CHECK-NEXT: .quad 9223372036854775807
+; CHECK-NEXT: .quad 9223372036854775807
+
define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
; SSE2-LABEL: v2f64:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: movaps {{.*#+}} xmm5
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: andps %xmm5, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: andps %xmm3, %xmm1
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: andps [[SIGNMASK3]](%rip), %xmm1
+; SSE2-NEXT: andps [[MAGMASK3]](%rip), %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00]
-; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vmovapd {{.*#+}} xmm4
-; AVX-NEXT: vandpd %xmm4, %xmm0, %xmm5
-; AVX-NEXT: vorpd %xmm3, %xmm5, %xmm3
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX-NEXT: vandps [[SIGNMASK3]](%rip), %xmm1, %xmm1
+; AVX-NEXT: vandps [[MAGMASK3]](%rip), %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
%tmp = tail call <2 x double> @llvm.copysign.v2f64( <2 x double> %a, <2 x double> %b )
ret <2 x double> %tmp
}
+; SSE2: [[SIGNMASK4:L.+]]:
+; SSE2-NEXT: .quad -9223372036854775808
+; SSE2-NEXT: .quad -9223372036854775808
+
+; SSE2: [[MAGMASK4:L.+]]:
+; SSE2-NEXT: .quad 9223372036854775807
+; SSE2-NEXT: .quad 9223372036854775807
+
+; AVX: [[SIGNMASK4:L.+]]:
+; AVX-NEXT: .quad -9223372036854775808
+; AVX-NEXT: .quad -9223372036854775808
+; AVX-NEXT: .quad -9223372036854775808
+; AVX-NEXT: .quad -9223372036854775808
+
+; AVX: [[MAGMASK4:L.+]]:
+; AVX-NEXT: .quad 9223372036854775807
+; AVX-NEXT: .quad 9223372036854775807
+; AVX-NEXT: .quad 9223372036854775807
+; AVX-NEXT: .quad 9223372036854775807
+
define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
; SSE2-LABEL: v4f64:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: movaps {{.*#+}} xmm5 = [-0.000000e+00,-0.000000e+00]
-; SSE2-NEXT: movaps %xmm2, %xmm6
-; SSE2-NEXT: andps %xmm5, %xmm6
-; SSE2-NEXT: movaps {{.*#+}} xmm7
-; SSE2-NEXT: andps %xmm7, %xmm0
-; SSE2-NEXT: orps %xmm6, %xmm0
-; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT: andps %xmm5, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
-; SSE2-NEXT: andps %xmm7, %xmm4
-; SSE2-NEXT: orps %xmm2, %xmm4
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT: movaps %xmm3, %xmm4
-; SSE2-NEXT: andps %xmm5, %xmm4
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: andps %xmm7, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; SSE2-NEXT: andps %xmm5, %xmm3
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: andps %xmm7, %xmm1
+; SSE2-NEXT: movaps [[SIGNMASK4]](%rip), %xmm4
+; SSE2-NEXT: andps %xmm4, %xmm2
+; SSE2-NEXT: movaps [[MAGMASK4]](%rip), %xmm5
+; SSE2-NEXT: andps %xmm5, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm3
+; SSE2-NEXT: andps %xmm5, %xmm1
; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: movapd %xmm2, %xmm1
; SSE2-NEXT: retq
;
; AVX-LABEL: v4f64:
; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
-; AVX-NEXT: vandpd %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX-NEXT: vmovapd {{.*#+}} xmm6
-; AVX-NEXT: vandpd %xmm6, %xmm5, %xmm7
-; AVX-NEXT: vorpd %xmm4, %xmm7, %xmm4
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX-NEXT: vandpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX-NEXT: vandpd %xmm6, %xmm5, %xmm5
-; AVX-NEXT: vorpd %xmm2, %xmm5, %xmm2
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm4
-; AVX-NEXT: vandpd %xmm6, %xmm0, %xmm5
-; AVX-NEXT: vorpd %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vandpd %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps [[SIGNMASK4]](%rip), %ymm1, %ymm1
+; AVX-NEXT: vandps [[MAGMASK4]](%rip), %ymm0, %ymm0
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
%tmp = tail call <4 x double> @llvm.copysign.v4f64( <4 x double> %a, <4 x double> %b )
ret <4 x double> %tmp
}
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll?rev=283119&r1=283118&r2=283119&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll Mon Oct 3 11:38:27 2016
@@ -25,14 +25,10 @@ declare double @llvm.copysign.f64(double
define void @fcopysign_2f64() #0 {
; CHECK-LABEL: @fcopysign_2f64(
-; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call double @llvm.copysign.f64(double [[A0]], double [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call double @llvm.copysign.f64(double [[A1]], double [[B1]])
-; CHECK-NEXT: store double [[FCOPYSIGN0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; CHECK-NEXT: store double [[FCOPYSIGN1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; CHECK-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
@@ -47,24 +43,23 @@ define void @fcopysign_2f64() #0 {
}
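At the C++ level, the pattern being vectorized in these tests is just
consecutive scalar copysign calls over adjacent array elements, roughly
(array names mirror the test's globals; illustrative only):

  #include <cmath>

  double srcA64[8], srcB64[8], dst64[8];

  // The SLP vectorizer now merges these two scalar calls into a single
  // call to llvm.copysign.v2f64, as checked above.
  void fcopysign_2f64() {
    dst64[0] = std::copysign(srcA64[0], srcB64[0]);
    dst64[1] = std::copysign(srcA64[1], srcB64[1]);
  }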
define void @fcopysign_4f64() #0 {
-; CHECK-LABEL: @fcopysign_4f64(
-; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
-; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
-; CHECK-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
-; CHECK-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call double @llvm.copysign.f64(double [[A0]], double [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call double @llvm.copysign.f64(double [[A1]], double [[B1]])
-; CHECK-NEXT: [[FCOPYSIGN2:%.*]] = call double @llvm.copysign.f64(double [[A2]], double [[B2]])
-; CHECK-NEXT: [[FCOPYSIGN3:%.*]] = call double @llvm.copysign.f64(double [[A3]], double [[B3]])
-; CHECK-NEXT: store double [[FCOPYSIGN0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
-; CHECK-NEXT: store double [[FCOPYSIGN1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; CHECK-NEXT: store double [[FCOPYSIGN2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
-; CHECK-NEXT: store double [[FCOPYSIGN3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fcopysign_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
+; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
@@ -86,40 +81,35 @@ define void @fcopysign_4f64() #0 {
}
define void @fcopysign_8f64() #0 {
-; CHECK-LABEL: @fcopysign_8f64(
-; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
-; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
-; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
-; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
-; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4
-; CHECK-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4
-; CHECK-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4
-; CHECK-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4
-; CHECK-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4
-; CHECK-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4
-; CHECK-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4
-; CHECK-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4
-; CHECK-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call double @llvm.copysign.f64(double [[A0]], double [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call double @llvm.copysign.f64(double [[A1]], double [[B1]])
-; CHECK-NEXT: [[FCOPYSIGN2:%.*]] = call double @llvm.copysign.f64(double [[A2]], double [[B2]])
-; CHECK-NEXT: [[FCOPYSIGN3:%.*]] = call double @llvm.copysign.f64(double [[A3]], double [[B3]])
-; CHECK-NEXT: [[FCOPYSIGN4:%.*]] = call double @llvm.copysign.f64(double [[A4]], double [[B4]])
-; CHECK-NEXT: [[FCOPYSIGN5:%.*]] = call double @llvm.copysign.f64(double [[A5]], double [[B5]])
-; CHECK-NEXT: [[FCOPYSIGN6:%.*]] = call double @llvm.copysign.f64(double [[A6]], double [[B6]])
-; CHECK-NEXT: [[FCOPYSIGN7:%.*]] = call double @llvm.copysign.f64(double [[A7]], double [[B7]])
-; CHECK-NEXT: store double [[FCOPYSIGN0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
-; CHECK-NEXT: store double [[FCOPYSIGN7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fcopysign_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
+; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_8f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
@@ -158,22 +148,10 @@ define void @fcopysign_8f64() #0 {
define void @fcopysign_4f32() #0 {
; CHECK-LABEL: @fcopysign_4f32(
-; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call float @llvm.copysign.f32(float [[A0]], float [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call float @llvm.copysign.f32(float [[A1]], float [[B1]])
-; CHECK-NEXT: [[FCOPYSIGN2:%.*]] = call float @llvm.copysign.f32(float [[A2]], float [[B2]])
-; CHECK-NEXT: [[FCOPYSIGN3:%.*]] = call float @llvm.copysign.f32(float [[A3]], float [[B3]])
-; CHECK-NEXT: store float [[FCOPYSIGN0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; CHECK-NEXT: ret void
;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
@@ -196,40 +174,23 @@ define void @fcopysign_4f32() #0 {
}
define void @fcopysign_8f32() #0 {
-; CHECK-LABEL: @fcopysign_8f32(
-; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call float @llvm.copysign.f32(float [[A0]], float [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call float @llvm.copysign.f32(float [[A1]], float [[B1]])
-; CHECK-NEXT: [[FCOPYSIGN2:%.*]] = call float @llvm.copysign.f32(float [[A2]], float [[B2]])
-; CHECK-NEXT: [[FCOPYSIGN3:%.*]] = call float @llvm.copysign.f32(float [[A3]], float [[B3]])
-; CHECK-NEXT: [[FCOPYSIGN4:%.*]] = call float @llvm.copysign.f32(float [[A4]], float [[B4]])
-; CHECK-NEXT: [[FCOPYSIGN5:%.*]] = call float @llvm.copysign.f32(float [[A5]], float [[B5]])
-; CHECK-NEXT: [[FCOPYSIGN6:%.*]] = call float @llvm.copysign.f32(float [[A6]], float [[B6]])
-; CHECK-NEXT: [[FCOPYSIGN7:%.*]] = call float @llvm.copysign.f32(float [[A7]], float [[B7]])
-; CHECK-NEXT: store float [[FCOPYSIGN0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fcopysign_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; AVX-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
@@ -267,72 +228,35 @@ define void @fcopysign_8f32() #0 {
}
define void @fcopysign_16f32() #0 {
-; CHECK-LABEL: @fcopysign_16f32(
-; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4
-; CHECK-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4
-; CHECK-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4
-; CHECK-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4
-; CHECK-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4
-; CHECK-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4
-; CHECK-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4
-; CHECK-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4
-; CHECK-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4
-; CHECK-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4
-; CHECK-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4
-; CHECK-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4
-; CHECK-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4
-; CHECK-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4
-; CHECK-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4
-; CHECK-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4
-; CHECK-NEXT: [[FCOPYSIGN0:%.*]] = call float @llvm.copysign.f32(float [[A0]], float [[B0]])
-; CHECK-NEXT: [[FCOPYSIGN1:%.*]] = call float @llvm.copysign.f32(float [[A1]], float [[B1]])
-; CHECK-NEXT: [[FCOPYSIGN2:%.*]] = call float @llvm.copysign.f32(float [[A2]], float [[B2]])
-; CHECK-NEXT: [[FCOPYSIGN3:%.*]] = call float @llvm.copysign.f32(float [[A3]], float [[B3]])
-; CHECK-NEXT: [[FCOPYSIGN4:%.*]] = call float @llvm.copysign.f32(float [[A4]], float [[B4]])
-; CHECK-NEXT: [[FCOPYSIGN5:%.*]] = call float @llvm.copysign.f32(float [[A5]], float [[B5]])
-; CHECK-NEXT: [[FCOPYSIGN6:%.*]] = call float @llvm.copysign.f32(float [[A6]], float [[B6]])
-; CHECK-NEXT: [[FCOPYSIGN7:%.*]] = call float @llvm.copysign.f32(float [[A7]], float [[B7]])
-; CHECK-NEXT: [[FCOPYSIGN8:%.*]] = call float @llvm.copysign.f32(float [[A8]], float [[B8]])
-; CHECK-NEXT: [[FCOPYSIGN9:%.*]] = call float @llvm.copysign.f32(float [[A9]], float [[B9]])
-; CHECK-NEXT: [[FCOPYSIGN10:%.*]] = call float @llvm.copysign.f32(float [[A10]], float [[B10]])
-; CHECK-NEXT: [[FCOPYSIGN11:%.*]] = call float @llvm.copysign.f32(float [[A11]], float [[B11]])
-; CHECK-NEXT: [[FCOPYSIGN12:%.*]] = call float @llvm.copysign.f32(float [[A12]], float [[B12]])
-; CHECK-NEXT: [[FCOPYSIGN13:%.*]] = call float @llvm.copysign.f32(float [[A13]], float [[B13]])
-; CHECK-NEXT: [[FCOPYSIGN14:%.*]] = call float @llvm.copysign.f32(float [[A14]], float [[B14]])
-; CHECK-NEXT: [[FCOPYSIGN15:%.*]] = call float @llvm.copysign.f32(float [[A15]], float [[B15]])
-; CHECK-NEXT: store float [[FCOPYSIGN0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
-; CHECK-NEXT: store float [[FCOPYSIGN15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fcopysign_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
+; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_16f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX-NEXT: ret void
;
%a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
%a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4