[llvm] r204768 - [PowerPC] Select between VSX A-type and M-type FMA instructions just before RA

Tue Mar 25 16:29:21 PDT 2014

Author: hfinkel
Date: Tue Mar 25 18:29:21 2014
New Revision: 204768

URL: http://llvm.org/viewvc/llvm-project?rev=204768&view=rev
Log:
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA

The VSX instruction set has two types of FMA instructions: A-type (where the
addend is taken from the output register) and M-type (where one of the product
operands is taken from the output register). This adds a small pass that runs
just after MI scheduling (and, thus, just before register allocation) that
mutates A-type instructions (that are created during isel) into M-type
instructions when:

 1. This will eliminate an otherwise-necessary copy of the addend

 2. One of the product operands is killed by the instruction

The "right" moment to make this decision is in between scheduling and register
allocation, because only there do we know whether or not one of the product
operands is killed by any particular instruction. Unfortunately, this also
makes the implementation somewhat complicated, because the MIs are not in SSA
form and we need to preserve the LiveIntervals analysis.

As a simple example, if we have:

  %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
  %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
                        %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
  ...
  %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
                        %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
  ...

We can eliminate the copy by changing from the A-type to the
M-type instruction. This means:

  %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
                        %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16

is replaced by:

  %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9,
                        %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9

and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9

Added:
    llvm/trunk/test/CodeGen/PowerPC/vsx-fma-m.ll
Modified:
    llvm/trunk/lib/Target/PowerPC/PPC.h
    llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp
    llvm/trunk/lib/Target/PowerPC/PPCTargetMachine.cpp

Modified: llvm/trunk/lib/Target/PowerPC/PPC.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPC.h?rev=204768&r1=204767&r2=204768&view=diff
==============================================================================

--- llvm/trunk/lib/Target/PowerPC/PPC.h (original)
+++ llvm/trunk/lib/Target/PowerPC/PPC.h Tue Mar 25 18:29:21 2014
@@ -23,6 +23,7 @@
 
 namespace llvm {
   class PPCTargetMachine;
+  class PassRegistry;
   class FunctionPass;
   class ImmutablePass;
   class JITCodeEmitter;
@@ -36,6 +37,7 @@ namespace llvm {
 #endif
   FunctionPass *createPPCEarlyReturnPass();
   FunctionPass *createPPCVSXCopyPass();
+  FunctionPass *createPPCVSXFMAMutatePass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
@@ -46,6 +48,9 @@ namespace llvm {
   /// \brief Creates an PPC-specific Target Transformation Info pass.
   ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
 
+  void initializePPCVSXFMAMutatePass(PassRegistry&);
+  extern char &PPCVSXFMAMutateID;
+
   namespace PPCII {
     
   /// Target Operand Flag enum.

Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp?rev=204768&r1=204767&r2=204768&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp Tue Mar 25 18:29:21 2014
@@ -20,14 +20,17 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -45,6 +48,9 @@ opt<bool> DisableCTRLoopAnal("disable-pp
 static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
 cl::desc("Disable compare instruction optimization"), cl::Hidden);
 
+static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
+cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
+
 // Pin the vtable to this file.
 void PPCInstrInfo::anchor() {}
 
@@ -1564,6 +1570,263 @@ unsigned PPCInstrInfo::GetInstSizeInByte
   }
 }
 
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace {
+  // PPCVSXFMAMutate pass - Mutates A-type VSX FMA instructions into their
+  // M-type forms when doing so eliminates a copy of the addend and one of the
+  // product operands is killed by the instruction.
+  struct PPCVSXFMAMutate : public MachineFunctionPass {
+    static char ID;
+    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    }
+
+    LiveIntervals *LIS;
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+
+        // The default (A-type) VSX FMA form kills the addend (it is taken from
+        // the target register, which is then updated to reflect the result of
+        // the FMA). If the instruction, however, kills one of the registers
+        // used for the product, then we can use the M-form instruction (which
+        // will take that value from the to-be-defined register).
+
+        int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+        if (AltOpc == -1)
+          continue;
+
+        // This pass is run after register coalescing, and so we're looking for
+        // a situation like this:
+        //   ...
+        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        //   ...
+        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+        //                         %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+        //   ...
+        // Where we can eliminate the copy by changing from the A-type to the
+        // M-type instruction. Specifically, for this example, this means:
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        // is replaced by:
+        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9,
+        //                         %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9
+        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+        VNInfo *AddendValNo =
+          LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
+        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+        // The addend and this instruction must be in the same block.
+
+        if (AddendMI->getParent() != MI->getParent())
+          continue;
+
+        // The addend must be a full copy within the same register class.
+
+        if (!AddendMI->isFullCopy())
+          continue;
+
+        if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+            MRI.getRegClass(AddendMI->getOperand(1).getReg()))
+          continue;
+
+        // In theory, there could be other uses of the addend copy before this
+        // fma.  We could deal with this, but that would require additional
+        // logic below and I suspect it will not occur in any relevant
+        // situations.
+        bool OtherUsers = false;
+        for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+             J != JE; --J)
+          if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+            OtherUsers = true;
+            break;
+          }
+
+        if (OtherUsers)
+          continue;
+
+        // Find one of the product operands that is killed by this instruction.
+
+        unsigned KilledProdOp = 0, OtherProdOp = 0;
+        if (LIS->getInterval(MI->getOperand(2).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 2;
+          OtherProdOp  = 3;
+        } else if (LIS->getInterval(MI->getOperand(3).getReg())
+                     .Query(FMAIdx).isKill()) {
+          KilledProdOp = 3;
+          OtherProdOp  = 2;
+        }
+
+	// If there are no killed product operands, then this transformation is
+	// likely not profitable.
+        if (!KilledProdOp)
+          continue;
+
+        // In order to replace the addend here with the source of the copy,
+        // it must still be live here.
+        if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx))
+          continue;
+
+        // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+        unsigned AddReg = AddendMI->getOperand(1).getReg();
+        unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
+        unsigned OtherProdReg  = MI->getOperand(OtherProdOp).getReg();
+
+        unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+        unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
+        unsigned OtherProdSubReg  = MI->getOperand(OtherProdOp).getSubReg();
+
+        bool AddRegKill = AddendMI->getOperand(1).isKill();
+        bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
+        bool OtherProdRegKill  = MI->getOperand(OtherProdOp).isKill();
+
+        bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+        bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
+        bool OtherProdRegUndef  = MI->getOperand(OtherProdOp).isUndef();
+
+        unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+        assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+               "Addend copy not tied to old FMA output!");
+
+        DEBUG(dbgs() << "VSX FMA Mutation:\n    " << *MI;);
+
+        MI->getOperand(0).setReg(KilledProdReg);
+        MI->getOperand(1).setReg(KilledProdReg);
+        MI->getOperand(3).setReg(AddReg);
+        MI->getOperand(2).setReg(OtherProdReg);
+
+        MI->getOperand(0).setSubReg(KilledProdSubReg);
+        MI->getOperand(1).setSubReg(KilledProdSubReg);
+        MI->getOperand(3).setSubReg(AddSubReg);
+        MI->getOperand(2).setSubReg(OtherProdSubReg);
+
+        MI->getOperand(1).setIsKill(KilledProdRegKill);
+        MI->getOperand(3).setIsKill(AddRegKill);
+        MI->getOperand(2).setIsKill(OtherProdRegKill);
+
+        MI->getOperand(1).setIsUndef(KilledProdRegUndef);
+        MI->getOperand(3).setIsUndef(AddRegUndef);
+        MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+
+        MI->setDesc(TII->get(AltOpc));
+
+        DEBUG(dbgs() << " -> " << *MI);
+
+        // The killed product operand was killed here, so we can reuse it now
+        // for the result of the fma.
+
+        LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+        VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+        for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+             UI != UE;) {
+          MachineOperand &UseMO = *UI;
+          MachineInstr *UseMI = UseMO.getParent();
+          ++UI;
+
+          // Don't replace the result register of the copy we're about to erase.
+          if (UseMI == AddendMI)
+            continue;
+
+          UseMO.setReg(KilledProdReg);
+          UseMO.setSubReg(KilledProdSubReg);
+        }
+
+        // Extend the live intervals of the killed product operand to hold the
+        // fma result.
+
+        LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+        for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+             AI != AE; ++AI) {
+          // Don't add the segment that corresponds to the original copy.
+          if (AI->valno == AddendValNo)
+            continue;
+
+          VNInfo *NewFMAValNo =
+            NewFMAInt.getNextValue(AI->start,
+                                   LIS->getVNInfoAllocator());
+
+          NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+                                                     NewFMAValNo));
+        }
+        DEBUG(dbgs() << "  extended: " << NewFMAInt << '\n');
+
+        FMAInt.removeValNo(FMAValNo);
+        DEBUG(dbgs() << "  trimmed:  " << FMAInt << '\n');
+
+        // Remove the (now unused) copy.
+
+        DEBUG(dbgs() << "  removing: " << *AddendMI << '\n');
+        LIS->RemoveMachineInstrFromMaps(AddendMI);
+        AddendMI->eraseFromParent();
+
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      LIS = &getAnalysis<LiveIntervals>();
+
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      TII = TM->getInstrInfo();
+
+      bool Changed = false;
+
+      if (DisableVSXFMAMutate)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+                      "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+                    "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass*
+llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
 
 #undef DEBUG_TYPE
 #define DEBUG_TYPE "ppc-vsx-copy"

Modified: llvm/trunk/lib/Target/PowerPC/PPCTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCTargetMachine.cpp?rev=204768&r1=204767&r2=204768&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetMachine.cpp Tue Mar 25 18:29:21 2014
@@ -26,6 +26,10 @@ static cl::
 opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                         cl::desc("Disable CTR loops for PPC"));
 
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+  cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
 extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
@@ -126,6 +130,7 @@ public:
   virtual bool addPreISel();
   virtual bool addILPOpts();
   virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
   virtual bool addPreSched2();
   virtual bool addPreEmitPass();
 };
@@ -165,6 +170,16 @@ bool PPCPassConfig::addInstSelector() {
 
   return false;
 }
+
+bool PPCPassConfig::addPreRegAlloc() {
+  if (getPPCSubtarget().hasVSX()) {
+    initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+               &PPCVSXFMAMutateID);
+  }
+
+  return false;
+}
 
 bool PPCPassConfig::addPreSched2() {
   if (getOptLevel() != CodeGenOpt::None)

Added: llvm/trunk/test/CodeGen/PowerPC/vsx-fma-m.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/vsx-fma-m.ll?rev=204768&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vsx-fma-m.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/vsx-fma-m.ll Tue Mar 25 18:29:21 2014
@@ -0,0 +1,124 @@
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+
+; Also run with -schedule-ppc-vsx-fma-mutation-early as a stress test for the
+; live-interval-updating logic.
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -schedule-ppc-vsx-fma-mutation-early
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @test1(double %a, double %b, double %c, double %e, double* nocapture %d) #0 {
+entry:
+  %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+  store double %0, double* %d, align 8
+  %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+  %arrayidx1 = getelementptr inbounds double* %d, i64 1
+  store double %1, double* %arrayidx1, align 8
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: xsmaddmdp 3, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 4
+; CHECK-DAG: stxsdx 3, 0, 7
+; CHECK-DAG: stxsdx 1, 7, [[C1]]
+; CHECK: blr
+}
+
+define void @test2(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+  %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+  store double %0, double* %d, align 8
+  %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+  %arrayidx1 = getelementptr inbounds double* %d, i64 1
+  store double %1, double* %arrayidx1, align 8
+  %2 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+  %arrayidx2 = getelementptr inbounds double* %d, i64 2
+  store double %2, double* %arrayidx2, align 8
+  ret void
+
+; CHECK-LABEL: @test2
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: xsmaddmdp 3, 2, 1
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 5
+; CHECK-DAG: stxsdx 3, 0, 8
+; CHECK-DAG: stxsdx 4, 8, [[C1]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK: blr
+}
+
+define void @test3(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+  %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+  store double %0, double* %d, align 8
+  %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+  %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
+  %arrayidx1 = getelementptr inbounds double* %d, i64 3
+  store double %2, double* %arrayidx1, align 8
+  %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+  %arrayidx2 = getelementptr inbounds double* %d, i64 2
+  store double %3, double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %d, i64 1
+  store double %1, double* %arrayidx3, align 8
+  ret void
+
+; CHECK-LABEL: @test3
+; CHECK-DAG: xxlor [[F1:[0-9]+]], 1, 1
+; CHECK-DAG: li [[C1:[0-9]+]], 24
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: li [[C3:[0-9]+]], 8
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 5
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xsmaddadp [[F1]], 2, 3
+
+; CHECK-DAG: xsmaddmdp 2, 3, 4
+; CHECK-DAG: stxsdx [[F1]], 0, 8
+; CHECK-DAG: stxsdx 2, 8, [[C1]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK-DAG: stxsdx 4, 8, [[C3]]
+; CHECK-DAG: blr
+}
+
+define void @test4(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+  %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+  store double %0, double* %d, align 8
+  %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+  %arrayidx1 = getelementptr inbounds double* %d, i64 1
+  store double %1, double* %arrayidx1, align 8
+  %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
+  %arrayidx3 = getelementptr inbounds double* %d, i64 3
+  store double %2, double* %arrayidx3, align 8
+  %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+  %arrayidx4 = getelementptr inbounds double* %d, i64 2
+  store double %3, double* %arrayidx4, align 8
+  ret void
+
+; CHECK-LABEL: @test4
+; CHECK-DAG: xxlor [[F1:[0-9]+]], 1, 1
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xsmaddadp 1, 2, 5
+
+; CHECK-DAG: xsmaddadp [[F1]], 2, 3
+; CHECK-DAG: stxsdx [[F1]], 0, 8
+; CHECK-DAG: stxsdx 4, 8, [[C1]]
+; CHECK-DAG: li [[C3:[0-9]+]], 24
+; CHECK-DAG: xsmaddadp 4, 2, 3
+; CHECK-DAG: stxsdx 4, 8, [[C3]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK: blr
+}
+
+declare double @llvm.fma.f64(double, double, double) #0
+
+attributes #0 = { nounwind readnone }
+