[llvm] 931e956 - [llvm][clang][fpenv] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
Melanie Blower via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 28 09:27:08 PDT 2021
Author: Melanie Blower
Date: 2021-06-28T12:26:52-04:00
New Revision: 931e95687d6df71aa8a33376fd2d566c8153be24
URL: https://github.com/llvm/llvm-project/commit/931e95687d6df71aa8a33376fd2d566c8153be24
DIFF: https://github.com/llvm/llvm-project/commit/931e95687d6df71aa8a33376fd2d566c8153be24.diff
LOG: [llvm][clang][fpenv] Create new intrinsic llvm.arith.fence to control FP optimization at expression level
This intrinsic blocks floating-point transformations by the optimizer.
Author: Pengfei
Reviewed By: LuoYuanke, Andy Kaylor, Craig Topper, kpn
Differential Revision: https://reviews.llvm.org/D99675
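For illustration, the fence is used as in the f2 test added below: without it,
the fast-math flags on the mul and add would permit contracting them into a
fused multiply-add (as f1 shows), while the fence keeps the two operations
separate. A minimal sketch (function and value names are illustrative):

    define float @fence_sketch(float %a, float %b, float %c) {
      ; "fast" would normally allow this multiply to fuse with the
      ; following add into an FMA; the fence blocks that contraction.
      %mul = fmul fast float %b, %a
      %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
      %add = fadd fast float %tmp, %c
      ret float %add
    }

    declare float @llvm.arithmetic.fence.f32(float)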
Added:
llvm/test/CodeGen/X86/arithmetic_fence.ll
llvm/test/CodeGen/X86/arithmetic_fence2.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/CodeGen/SelectionDAGISel.h
llvm/include/llvm/IR/IRBuilder.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/Support/TargetOpcodes.def
llvm/include/llvm/Target/Target.td
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index fc9bf536a9fd..1986f232cc3e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21453,6 +21453,42 @@ If the function's return value's second element is false, the value of the
first element is undefined.
+'``llvm.arithmetic.fence``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare <type>
+ @llvm.arithmetic.fence(<type> <op>)
+
+Overview:
+"""""""""
+
+The purpose of the ``llvm.arithmetic.fence`` intrinsic
+is to prevent the optimizer from performing fast-math optimizations,
+particularly reassociation,
+between the argument and the expression that contains the argument.
+It can be used to preserve the parentheses in the source language.
+
+Arguments:
+""""""""""
+
+The ``llvm.arithmetic.fence`` intrinsic takes only one argument.
+The argument and the return value are floating-point numbers,
+or vectors of floating-point numbers, of the same type.
+
+Semantics:
+""""""""""
+
+This intrinsic returns the value of its operand. The optimizer can optimize
+the argument, but it cannot hoist any component of the operand out to the
+containing context, and it cannot move the calculation of any expression
+from the containing context into the operand.
+
+
'``llvm.donothing``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
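As a sketch of the "preserve the parentheses" use case described above, a
front end lowering the source expression (a + b) + c under fast-math could
fence the parenthesized subexpression so that reassociation cannot regroup
the adds (value names are illustrative):

    %ab  = fadd fast float %a, %b
    %p   = call float @llvm.arithmetic.fence.f32(float %ab)
    %sum = fadd fast float %p, %c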
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2e9e27fcb86e..657e8d81aa73 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -576,6 +576,7 @@ class TargetTransformInfoImplBase {
case Intrinsic::assume:
case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:
+ case Intrinsic::arithmetic_fence:
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
case Intrinsic::dbg_label:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c6d92ad7f99d..85486474846d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1609,6 +1609,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::lifetime_end:
case Intrinsic::sideeffect:
case Intrinsic::pseudoprobe:
+ case Intrinsic::arithmetic_fence:
return 0;
case Intrinsic::masked_store: {
Type *Ty = Tys[0];
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6eb70ab47708..8ff83043e705 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1097,6 +1097,10 @@ enum NodeType {
/// specifier.
PREFETCH,
+  /// ARITH_FENCE - This corresponds to an arithmetic fence intrinsic. Both
+  /// its operand and output are of the same floating-point type.
+ ARITH_FENCE,
+
/// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
/// This corresponds to the fence instruction. It takes an input chain, and
/// two integer constants: an AtomicOrdering and a SynchronizationScope.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index f6afa5eedc8d..94ba6ad91517 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -318,6 +318,7 @@ class SelectionDAGISel : public MachineFunctionPass {
void CannotYetSelect(SDNode *N);
void Select_FREEZE(SDNode *N);
+ void Select_ARITH_FENCE(SDNode *N);
private:
void DoInstructionSelection();
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 7fb504ad0d66..aa7c90f932f8 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -905,6 +905,13 @@ class IRBuilderBase {
return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
}
+ /// Create a call to the arithmetic_fence intrinsic.
+ CallInst *CreateArithmeticFence(Value *Val, Type *DstType,
+ const Twine &Name = "") {
+ return CreateIntrinsic(Intrinsic::arithmetic_fence, DstType, Val, nullptr,
+ Name);
+ }
+
/// Create a call to the experimental.vector.extract intrinsic.
CallInst *CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx,
const Twine &Name = "") {
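A short usage sketch for the new builder helper (assumed caller context:
``V`` is a float-typed ``Value*``); note the result type must match the
operand's floating-point type:

    #include "llvm/IR/IRBuilder.h"

    // Sketch: fence a floating-point value so later fast-math
    // transformations cannot reassociate across it.
    llvm::Value *emitFenced(llvm::IRBuilderBase &Builder, llvm::Value *V) {
      return Builder.CreateArithmeticFence(V, V->getType(), "fence");
    }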
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 58483ff47ba1..c7bdd86d82f8 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1335,6 +1335,9 @@ def int_sideeffect : DefaultAttrsIntrinsic<[], [], [IntrInaccessibleMemOnly, Int
def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
[IntrInaccessibleMemOnly, IntrWillReturn]>;
+// Arithmetic fence intrinsic.
+def int_arithmetic_fence : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
// Intrinsics to support half precision floating point format
let IntrProperties = [IntrNoMem, IntrWillReturn] in {
def int_convert_to_fp16 : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>;
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index a153eae96519..154329f8a979 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -121,6 +121,9 @@ HANDLE_TARGET_OPCODE(LIFETIME_END)
/// Pseudo probe
HANDLE_TARGET_OPCODE(PSEUDO_PROBE)
+/// Arithmetic fence.
+HANDLE_TARGET_OPCODE(ARITH_FENCE)
+
/// A Stackmap instruction captures the location of live variables at its
/// position in the instruction stream. It is followed by a shadow of bytes
/// that must lie within the function and not contain another stackmap.
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 71c74f3d5cde..e9720d765167 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1176,6 +1176,13 @@ def PSEUDO_PROBE : StandardPseudoInstruction {
let AsmString = "PSEUDO_PROBE";
let hasSideEffects = 1;
}
+def ARITH_FENCE : StandardPseudoInstruction {
+ let OutOperandList = (outs unknown:$dst);
+ let InOperandList = (ins unknown:$src);
+ let AsmString = "";
+ let hasSideEffects = false;
+ let Constraints = "$src = $dst";
+}
def STACKMAP : StandardPseudoInstruction {
let OutOperandList = (outs);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 6b06bb88edbd..f04cbb07403b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1332,6 +1332,10 @@ void AsmPrinter::emitFunctionBody() {
case TargetOpcode::PSEUDO_PROBE:
emitPseudoProbe(MI);
break;
+ case TargetOpcode::ARITH_FENCE:
+ if (isVerbose())
+ OutStreamer->emitRawComment("ARITH_FENCE");
+ break;
default:
emitInstruction(&MI);
if (CanDoExtraAnalysis) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22adb9ae52f5..7bc8d1d2333b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -90,6 +90,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
@@ -983,6 +984,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
@@ -3146,6 +3148,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FNEG:
case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
case ISD::FCANONICALIZE:
Res = WidenVecRes_Unary(N);
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b63246ac671b..0774c7fcf011 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6292,6 +6292,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), Flags));
return;
+ case Intrinsic::arithmetic_fence: {
+ setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), Flags));
+ return;
+ }
case Intrinsic::fma:
setValue(&I, DAG.getNode(
ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index e049be94fcf1..e3ff00131dbe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2325,6 +2325,11 @@ void SelectionDAGISel::Select_FREEZE(SDNode *N) {
N->getOperand(0));
}
+void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::ARITH_FENCE, N->getValueType(0),
+ N->getOperand(0));
+}
+
/// GetVBR - decode a vbr encoding whose top bit is set.
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
@@ -2876,6 +2881,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case ISD::FREEZE:
Select_FREEZE(NodeToMatch);
return;
+ case ISD::ARITH_FENCE:
+ Select_ARITH_FENCE(NodeToMatch);
+ return;
}
assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence.ll b/llvm/test/CodeGen/X86/arithmetic_fence.ll
new file mode 100644
index 000000000000..eddc0cc33de9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arithmetic_fence.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefix=X64
+
+define float @f1(float %a, float %b, float %c) {
+; X86-LABEL: f1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; X86-NEXT: vmovss %xmm1, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f1:
+; X64: # %bb.0:
+; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; X64-NEXT: retq
+ %mul = fmul fast float %b, %a
+ %add = fadd fast float %mul, %c
+ ret float %add
+}
+
+define float @f2(float %a, float %b, float %c) {
+; X86-LABEL: f2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f2:
+; X64: # %bb.0:
+; X64-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %mul = fmul fast float %b, %a
+ %tmp = call float @llvm.arithmetic.fence.f32(float %mul)
+ %add = fadd fast float %tmp, %c
+ ret float %add
+}
+
+define double @f3(double %a) {
+; X86-LABEL: f3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f3:
+; X64: # %bb.0:
+; X64-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %1, %2
+ ret double %3
+}
+
+define double @f4(double %a) {
+; X86-LABEL: f4:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovapd %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f4:
+; X64: # %bb.0:
+; X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovapd %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %t = call double @llvm.arithmetic.fence.f64(double %1)
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %t, %2
+ ret double %3
+}
+
+define <2 x float> @f5(<2 x float> %a) {
+; X86-LABEL: f5:
+; X86: # %bb.0:
+; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f5:
+; X64: # %bb.0:
+; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %1, %2
+ ret <2 x float> %3
+}
+
+define <2 x float> @f6(<2 x float> %a) {
+; X86-LABEL: f6:
+; X86: # %bb.0:
+; X86-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f6:
+; X64: # %bb.0:
+; X64-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovaps %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %t, %2
+ ret <2 x float> %3
+}
+
+declare float @llvm.arithmetic.fence.f32(float)
+declare double @llvm.arithmetic.fence.f64(double)
+declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
new file mode 100644
index 000000000000..22dab9ffa822
--- /dev/null
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define double @f1(double %a) {
+; X86-LABEL: f1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f1:
+; X64: # %bb.0:
+; X64-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %1, %2
+ ret double %3
+}
+
+define double @f2(double %a) {
+; X86-LABEL: f2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: addsd %xmm0, %xmm0
+; X86-NEXT: movapd %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addsd %xmm0, %xmm1
+; X86-NEXT: movsd %xmm1, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl
+;
+; X64-LABEL: f2:
+; X64: # %bb.0:
+; X64-NEXT: addsd %xmm0, %xmm0
+; X64-NEXT: movapd %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addsd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast double %a, %a
+ %t = call double @llvm.arithmetic.fence.f64(double %1)
+ %2 = fadd fast double %a, %a
+ %3 = fadd fast double %t, %2
+ ret double %3
+}
+
+define <2 x float> @f3(<2 x float> %a) {
+; X86-LABEL: f3:
+; X86: # %bb.0:
+; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f3:
+; X64: # %bb.0:
+; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %1, %2
+ ret <2 x float> %3
+}
+
+define <2 x float> @f4(<2 x float> %a) {
+; X86-LABEL: f4:
+; X86: # %bb.0:
+; X86-NEXT: addps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, %xmm1
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addps %xmm0, %xmm1
+; X86-NEXT: movaps %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: f4:
+; X64: # %bb.0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, %xmm1
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %1 = fadd fast <2 x float> %a, %a
+ %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
+ %2 = fadd fast <2 x float> %a, %a
+ %3 = fadd fast <2 x float> %t, %2
+ ret <2 x float> %3
+}
+
+define <8 x float> @f5(<8 x float> %a) {
+; X86-LABEL: f5:
+; X86: # %bb.0:
+; X86-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X86-NEXT: mulps %xmm2, %xmm0
+; X86-NEXT: mulps %xmm2, %xmm1
+; X86-NEXT: retl
+;
+; X64-LABEL: f5:
+; X64: # %bb.0:
+; X64-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; X64-NEXT: mulps %xmm2, %xmm0
+; X64-NEXT: mulps %xmm2, %xmm1
+; X64-NEXT: retq
+ %1 = fadd fast <8 x float> %a, %a
+ %2 = fadd fast <8 x float> %a, %a
+ %3 = fadd fast <8 x float> %1, %2
+ ret <8 x float> %3
+}
+
+define <8 x float> @f6(<8 x float> %a) {
+; X86-LABEL: f6:
+; X86: # %bb.0:
+; X86-NEXT: addps %xmm0, %xmm0
+; X86-NEXT: addps %xmm1, %xmm1
+; X86-NEXT: movaps %xmm1, %xmm2
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: movaps %xmm0, %xmm3
+; X86-NEXT: #ARITH_FENCE
+; X86-NEXT: addps %xmm0, %xmm3
+; X86-NEXT: addps %xmm1, %xmm2
+; X86-NEXT: movaps %xmm3, %xmm0
+; X86-NEXT: movaps %xmm2, %xmm1
+; X86-NEXT: retl
+;
+; X64-LABEL: f6:
+; X64: # %bb.0:
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: addps %xmm1, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm2
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: #ARITH_FENCE
+; X64-NEXT: addps %xmm0, %xmm3
+; X64-NEXT: addps %xmm1, %xmm2
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: movaps %xmm2, %xmm1
+; X64-NEXT: retq
+ %1 = fadd fast <8 x float> %a, %a
+ %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
+ %2 = fadd fast <8 x float> %a, %a
+ %3 = fadd fast <8 x float> %t, %2
+ ret <8 x float> %3
+}
+
+declare float @llvm.arithmetic.fence.f32(float)
+declare double @llvm.arithmetic.fence.f64(double)
+declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
+declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)