[llvm] 931e956 - [llvm][clang][fpenv] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
Melanie Blower via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 28 09:27:08 PDT 2021
Author: Melanie Blower
Date: 2021-06-28T12:26:52-04:00
New Revision: 931e95687d6df71aa8a33376fd2d566c8153be24
URL: https://github.com/llvm/llvm-project/commit/931e95687d6df71aa8a33376fd2d566c8153be24
DIFF: https://github.com/llvm/llvm-project/commit/931e95687d6df71aa8a33376fd2d566c8153be24.diff
LOG: [llvm][clang][fpenv] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
This intrinsic blocks floating-point transformations by the optimizer.
Author: Pengfei
Reviewed By: LuoYuanke, Andy Kaylor, Craig Topper, kpn
Differential Revision: https://reviews.llvm.org/D99675
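For illustration, the fence is used as in the f2 test added below: without it,
the fast-math flags on the mul and add would permit contracting them into a
fused multiply-add (as f1 shows), while the fence keeps the two operations
separate. A minimal sketch (function and value names are illustrative):

    define float @fence_sketch(float %a, float %b, float %c) {
      ; "fast" would normally allow this multiply to fuse with the
      ; following add into an FMA; the fence blocks that contraction.
      %mul = fmul fast float %b, %a
      %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
      %add = fadd fast float %tmp, %c
      ret float %add
    }

    declare float @llvm.arithmetic.fence.f32(float)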
Added:
llvm/test/CodeGen/X86/arithmetic_fence.ll
llvm/test/CodeGen/X86/arithmetic_fence2.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/CodeGen/SelectionDAGISel.h
llvm/include/llvm/IR/IRBuilder.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/Support/TargetOpcodes.def
llvm/include/llvm/Target/Target.td
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index fc9bf536a9fd..1986f232cc3e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21453,6 +21453,42 @@ If the function's return value's second element is false, the value of the
first element is undefined.
+'``llvm.arithmetic.fence``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare <type>
+ @llvm.arithmetic.fence(<type> <op>)
+
+Overview:
+"""""""""
+
+The purpose of the ``llvm.arithmetic.fence`` intrinsic
+is to prevent the optimizer from performing fast-math optimizations,
+particularly reassociation,
+between the argument and the expression that contains the argument.
+It can be used to preserve the parentheses in the source language.
+
+Arguments:
+""""""""""
+
+The ``llvm.arithmetic.fence`` intrinsic takes only one argument.
+The argument and the return value are floating-point numbers,
+or vectors of floating-point numbers, of the same type.
+
+Semantics:
+""""""""""
+
+This intrinsic returns the value of its operand. The optimizer can optimize
+the argument, but it cannot hoist any component of the operand out to the
+containing context, and it cannot move the calculation of any expression
+from the containing context into the operand.
+
+
'``llvm.donothing``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
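As a sketch of the "preserve the parentheses" use case described above, a
front end lowering the source expression (a + b) + c under fast-math could
fence the parenthesized subexpression so that reassociation cannot regroup
the adds (value names are illustrative):

    %ab  = fadd fast float %a, %b
    %p   = call float @llvm.arithmetic.fence.f32(float %ab)
    %sum = fadd fast float %p, %c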
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2e9e27fcb86e..657e8d81aa73 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -576,6 +576,7 @@ class TargetTransformInfoImplBase {
case Intrinsic::assume:
case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:
+ case Intrinsic::arithmetic_fence:
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
case Intrinsic::dbg_label:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c6d92ad7f99d..85486474846d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1609,6 +1609,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::lifetime_end:
case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:
+ case Intrinsic::arithmetic_fence:
return 0;
case Intrinsic::masked_store: {
Type *Ty = Tys[0];
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6eb70ab47708..8ff83043e705 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1097,6 +1097,10 @@ enum NodeType {
/// specifier.
PREFETCH,
+  /// ARITH_FENCE - This corresponds to an arithmetic fence intrinsic. Both
+  /// its operand and output are of the same floating-point type.
+ ARITH_FENCE,
+
/// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
/// This corresponds to the fence instruction. It takes an input chain, and
/// two integer constants: an AtomicOrdering and a SynchronizationScope.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index f6afa5eedc8d..94ba6ad91517 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -318,6 +318,7 @@ class SelectionDAGISel : public MachineFunctionPass {
void CannotYetSelect(SDNode *N);
void Select_FREEZE(SDNode *N);
+ void Select_ARITH_FENCE(SDNode *N);
private:
void DoInstructionSelection();
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 7fb504ad0d66..aa7c90f932f8 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -905,6 +905,13 @@ class IRBuilderBase {
return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
}
+ /// Create a call to the arithmetic_fence intrinsic.
+ CallInst *CreateArithmeticFence(Value *Val, Type *DstType,
+ const Twine &Name = "") {
+ return CreateIntrinsic(Intrinsic::arithmetic_fence, DstType, Val, nullptr,
+ Name);
+ }
+
/// Create a call to the experimental.vector.extract intrinsic.
CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx,
const Twine &Name = "") {
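A short usage sketch for the new builder helper (assumed caller context:
``V`` is a float-typed ``Value*``); note the result type must match the
operand's floating-point type:

    #include "llvm/IR/IRBuilder.h"

    // Sketch: fence a floating-point value so later fast-math
    // transformations cannot reassociate across it.
    llvm::Value *emitFenced(llvm::IRBuilderBase &Builder, llvm::Value *V) {
      return Builder.CreateArithmeticFence(V, V->getType(), "fence");
    }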
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 58483ff47ba1..c7bdd86d82f8 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1335,6 +1335,9 @@ def int_sideeffect : DefaultAttrsIntrinsic<[], [], [IntrInaccessibleMemOnly, Int
def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
[IntrInaccessibleMemOnly, IntrWillReturn]>;
+// Arithmetic fence intrinsic.
+def int_arithmetic_fence : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
// Intrinsics to support half precision floating point format
let IntrProperties = [IntrNoMem, IntrWillReturn] in {
def int_convert_to_fp16 : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>;
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index a153eae96519..154329f8a979 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -121,6 +121,9 @@ HANDLE_TARGET_OPCODE(LIFETIME_END)
/// Pseudo probe
HANDLE_TARGET_OPCODE(PSEUDO_PROBE)
+/// Arithmetic fence.
+HANDLE_TARGET_OPCODE(ARITH_FENCE)
+
/// A Stackmap instruction captures the location of live variables at its
/// position in the instruction stream. It is followed by a shadow of bytes
/// that must lie within the function and not contain another stackmap.
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 71c74f3d5cde..e9720d765167 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1176,6 +1176,13 @@ def PSEUDO_PROBE : StandardPseudoInstruction {
let AsmString = "PSEUDO_PROBE";
let hasSideEffects = 1;
}
+def ARITH_FENCE : StandardPseudoInstruction {
+ let OutOperandList = (outs unknown:$dst);
+ let InOperandList = (ins unknown:$src);
+ let AsmString = "";
+ let hasSideEffects = false;
+ let Constraints = "$src = $dst";
+}
def STACKMAP : StandardPseudoInstruction {
let OutOperandList = (outs);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 6b06bb88edbd..f04cbb07403b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1332,6 +1332,10 @@ void AsmPrinter::emitFunctionBody() {
case TargetOpcode::PSEUDO_PROBE:
emitPseudoProbe(MI);
break;
+ case TargetOpcode::ARITH_FENCE:
+ if (isVerbose())
+ OutStreamer->emitRawComment("ARITH_FENCE");
+ break;
default:
emitInstruction(&MI);
if (CanDoExtraAnalysis) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22adb9ae52f5..7bc8d1d2333b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -90,6 +90,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
@@ -983,6 +984,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
@@ -3146,6 +3148,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FCANONICALIZE:
Res = WidenVecRes_Unary(N);
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b63246ac671b..0774c7fcf011 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6292,6 +6292,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), Flags));
return;
+ case Intrinsic::arithmetic_fence: {
+ setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), Flags));
+ return;
+ }
case Intrinsic::fma:
setValue(&I, DAG.getNode(
ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index e049be94fcf1..e3ff00131dbe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2325,6 +2325,11 @@ void SelectionDAGISel::Select_FREEZE(SDNode *N) {
N->getOperand(0));
}
+void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::ARITH_FENCE, N->getValueType(0),
+ N->getOperand(0));
+}
+
/// GetVBR - decode a vbr encoding whose top bit is set.
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
@@ -2876,6 +2881,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case ISD::FREEZE:
Select_FREEZE(NodeToMatch);
return;
+ case ISD::ARITH_FENCE:
+ Select_ARITH_FENCE(NodeToMatch);
+ return;
}
assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence.ll b/llvm/test/CodeGen/X86/arithmetic_fence.ll
new file mode 100644
index 000000000000..eddc0cc33de9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arithmetic_fence.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64
+
+define float @f1(float %a, float %b, float %c) {
+; X86-LABEL: f1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; X86-NEXT: vmovss %xmm1, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f1:
+; X64: # %bb.0:
+; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; X64-NEXT: retq
+ %mul = fmul fast float %b, %a
+ %add = fadd fast float %mul, %c
+ ret float %add
+}
+
+define float @f2(float %a, float %b, float %c) {
+; X86-LABEL: f2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f2:
+; X64: # %bb.0:
+; X64-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %mul = fmul fast float %b, %a
+ %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
+ %add = fadd fast float %tmp, %c
+ ret float %add
+}
+
+define double @f3(double %a) {
+; X86-LABEL: f3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f3:
+; X64: # %bb.0:
+; X64-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %1, %2
+ ret double %3
+}
+
+define double @f4(double %a) {
+; X86-LABEL: f4:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovapd %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f4:
+; X64: # %bb.0:
+; X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovapd %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %t = call double @llvm.arithmetic.fence.f64(double %1)
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %t, %2
+ ret double %3
+}
+
+define <2 x float> @f5(<2 x float> %a) {
+; X86-LABEL: f5:
+; X86: # %bb.0:
+; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f5:
+; X64: # %bb.0:
+; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %1, %2
+ ret <2 x float> %3
+}
+
+define <2 x float> @f6(<2 x float> %a) {
+; X86-LABEL: f6:
+; X86: # %bb.0:
+; X86-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f6:
+; X64: # %bb.0:
+; X64-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovaps %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %t, %2
+ ret <2 x float> %3
+}
+
+declare float @llvm.arithmetic.fence.f32(float)
+declare double @llvm.arithmetic.fence.f64(double)
+declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
new file mode 100644
index 000000000000..22dab9ffa822
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define double @f1(double %a) {
+; X86-LABEL: f1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f1:
+; X64: # %bb.0:
+; X64-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %1, %2
+ ret double %3
+}
+
+define double @f2(double %a) {
+; X86-LABEL: f2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: addsd %xmm0, %xmm0
+; X86-NEXT: movapd %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addsd %xmm0, %xmm1
+; X86-NEXT: movsd %xmm1, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f2:
+; X64: # %bb.0:
+; X64-NEXT: addsd %xmm0, %xmm0
+; X64-NEXT: movapd %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addsd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %t = call double @llvm.arithmetic.fence.f64(double %1)
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %t, %2
+ ret double %3
+}
+
+define <2 x float> @f3(<2 x float> %a) {
+; X86-LABEL: f3:
+; X86: # %bb.0:
+; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f3:
+; X64: # %bb.0:
+; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %1, %2
+ ret <2 x float> %3
+}
+
+define <2 x float> @f4(<2 x float> %a) {
+; X86-LABEL: f4:
+; X86: # %bb.0:
+; X86-NEXT: addps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addps %xmm0, %xmm1
+; X86-NEXT: movaps %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f4:
+; X64: # %bb.0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %t, %2
+ ret <2 x float> %3
+}
+
+define <8 x float> @f5(<8 x float> %a) {
+; X86-LABEL: f5:
+; X86: # %bb.0:
+; X86-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X86-NEXT: mulps %xmm2, %xmm0
+; X86-NEXT: mulps %xmm2, %xmm1
+; X86-NEXT: retl
+;
+; X64-LABEL: f5:
+; X64: # %bb.0:
+; X64-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X64-NEXT: mulps %xmm2, %xmm0
+; X64-NEXT: mulps %xmm2, %xmm1
+; X64-NEXT: retq
+ %1 = fadd fast <8 x float> %a, %a
+ %2 = fadd fast <8 x float> %a, %a
+ %3 = fadd fast <8 x float> %1, %2
+ ret <8 x float> %3
+}
+
+define <8 x float> @f6(<8 x float> %a) {
+; X86-LABEL: f6:
+; X86: # %bb.0:
+; X86-NEXT: addps %xmm0, %xmm0
+; X86-NEXT: addps %xmm1, %xmm1
+; X86-NEXT: movaps %xmm1, %xmm2
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: movaps %xmm0, %xmm3
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addps %xmm0, %xmm3
+; X86-NEXT: addps %xmm1, %xmm2
+; X86-NEXT: movaps %xmm3, %xmm0
+; X86-NEXT: movaps %xmm2, %xmm1
+; X86-NEXT: retl
+;
+; X64-LABEL: f6:
+; X64: # %bb.0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: addps %xmm1, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm2
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addps %xmm0, %xmm3
+; X64-NEXT: addps %xmm1, %xmm2
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: movaps %xmm2, %xmm1
+; X64-NEXT: retq
+ %1 = fadd fast <8 x float> %a, %a
+ %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
+ %2 = fadd fast <8 x float> %a, %a
+ %3 = fadd fast <8 x float> %t, %2
+ ret <8 x float> %3
+}
+
+declare float @llvm.arithmetic.fence.f32(float)
+declare double @llvm.arithmetic.fence.f64(double)
+declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
+declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)