[llvm] r192915 - Refactor lowering for COPY_STRUCT_BYVAL_I32

Thu Oct 17 12:49:22 PDT 2013

Author: dpeixott
Date: Thu Oct 17 14:49:22 2013
New Revision: 192915

URL: http://llvm.org/viewvc/llvm-project?rev=192915&view=rev
Log:
Refactor lowering for COPY_STRUCT_BYVAL_I32

This commit refactors the lowering of the COPY_STRUCT_BYVAL_I32
pseudo-instruction in the ARM backend. We introduce a new helper
class that encapsulates all of the operations needed during the
lowering. The operations are implemented for each subtarget in
different subclasses. Currently only arm and thumb2 subtargets are
supported.

This refactoring was done to easily implement support for thumb1
subtargets. This initial patch does not add support for thumb1, but
is only a refactoring. A follow-on patch will implement the support
for thumb1 subtargets.

No intended functionality change.

Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=192915&r1=192914&r2=192915&view=diff
==============================================================================

--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Oct 17 14:49:22 2013
@@ -48,6 +48,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <utility>
 using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -7245,8 +7246,430 @@ MachineBasicBlock *OtherSucc(MachineBasi
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
-MachineBasicBlock *ARMTargetLowering::
-EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+namespace {
+// This class is a helper for lowering the COPY_STRUCT_BYVAL_I32 instruction.
+// It defines the operations needed to lower the byval copy. We use a helper
+// class because the opcodes and machine instructions are different for each
+// subtarget, but the overall algorithm for the lowering is the same.  The
+// implementation of each operation will be defined separately for arm, thumb1,
+// and thumb2 targets by subclassing this base class. See
+// ARMTargetLowering::EmitStructByval() for how these operations are used.
+class TargetStructByvalEmitter {
+public:
+  TargetStructByvalEmitter(const TargetInstrInfo *TII_,
+                           MachineRegisterInfo &MRI_,
+                           const TargetRegisterClass *TRC_)
+      : TII(TII_), MRI(MRI_), TRC(TRC_) {}
+
+  // Emit a post-increment load of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (4, 2, or 1 bytes). Alignments higher
+  // than 4 are handled separately by using NEON instructions.
+  //
+  // \param baseReg the register holding the address to load.
+  // \param baseOut the register to receive the incremented address.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                DebugLoc &dl, unsigned baseReg,
+                                unsigned baseOut) = 0;
+
+  // Emit a post-increment store of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (4, 2, or 1 bytes). Alignments higher
+  // than 4 are handled separately by using NEON instructions.
+  //
+  // \param baseReg the register holding the address to store.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address.
+  virtual void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI,
+                             DebugLoc &dl, unsigned baseReg, unsigned storeReg,
+                             unsigned baseOut) = 0;
+
+  // Emit a post-increment load of one byte.
+  //
+  // \param baseReg the register holding the address to load.
+  // \param baseOut the register to receive the incremented address.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                DebugLoc &dl, unsigned baseReg,
+                                unsigned baseOut) = 0;
+
+  // Emit a post-increment store of one byte.
+  //
+  // \param baseReg the register holding the address to store.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address.
+  virtual void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI,
+                             DebugLoc &dl, unsigned baseReg, unsigned storeReg,
+                             unsigned baseOut) = 0;
+
+  // Emit a load of a constant value.
+  //
+  // \param Constant the constant value to load into a register.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                    DebugLoc &dl, unsigned Constant,
+                                    const DataLayout *DL) = 0;
+
+  // Emit a subtract of a register minus immediate, with the immediate equal to
+  // the "unit" size. The unit size is based on the alignment of the struct
+  // being copied (16, 8, 4, 2, or 1 bytes).
+  //
+  // \param InReg the register holding the initial value.
+  // \param OutReg the register to receive the subtracted value.
+  virtual void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                          unsigned InReg, unsigned OutReg) = 0;
+
+  // Emit a branch based on a condition code of not equal.
+  //
+  // \param TargetBB the destination of the branch.
+  virtual void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, MachineBasicBlock *TargetBB) = 0;
+
+  // Find the constant pool index for the given constant. This method is
+  // implemented in the base class because it is the same for all subtargets.
+  //
+  // \param LoopSize the constant value for which the index should be returned.
+  // \returns the constant pool index for the constant.
+  unsigned getConstantPoolIndex(MachineFunction *MF, const DataLayout *DL,
+                                unsigned LoopSize) {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = DL->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = DL->getTypeAllocSize(C->getType());
+    return ConstantPool->getConstantPoolIndex(C, Align);
+  }
+
+  // Return the register class used by the subtarget.
+  //
+  // \returns the target register class.
+  const TargetRegisterClass *getTRC() const { return TRC; }
+
+  virtual ~TargetStructByvalEmitter() {};
+
+protected:
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo &MRI;
+  const TargetRegisterClass *TRC;
+};
+
+class ARMStructByvalEmitter : public TargetStructByvalEmitter {
+public:
+  ARMStructByvalEmitter(const TargetInstrInfo *TII, MachineRegisterInfo &MRI,
+                        unsigned LoadStoreSize)
+      : TargetStructByvalEmitter(
+            TII, MRI, (const TargetRegisterClass *)&ARM::GPRRegClass),
+        UnitSize(LoadStoreSize),
+        UnitLdOpc(LoadStoreSize == 4
+                      ? ARM::LDR_POST_IMM
+                      : LoadStoreSize == 2
+                            ? ARM::LDRH_POST
+                            : LoadStoreSize == 1 ? ARM::LDRB_POST_IMM : 0),
+        UnitStOpc(LoadStoreSize == 4
+                      ? ARM::STR_POST_IMM
+                      : LoadStoreSize == 2
+                            ? ARM::STRH_POST
+                            : LoadStoreSize == 1 ? ARM::STRB_POST_IMM : 0) {}
+
+  unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitLdOpc), scratch).addReg(
+        baseOut, RegState::Define).addReg(baseReg).addReg(0).addImm(UnitSize));
+    return scratch;
+  }
+
+  void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitStOpc), baseOut).addReg(
+        storeReg).addReg(baseReg).addReg(0).addImm(UnitSize));
+  }
+
+  unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRB_POST_IMM), scratch)
+                       .addReg(baseOut, RegState::Define).addReg(baseReg)
+                       .addReg(0).addImm(1));
+    return scratch;
+  }
+
+  void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::STRB_POST_IMM), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addReg(0).addImm(1));
+  }
+
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned Constant,
+                            const DataLayout *DL) {
+    unsigned constReg = MRI.createVirtualRegister(TRC);
+    unsigned Idx = getConstantPoolIndex(BB->getParent(), DL, Constant);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+        constReg, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+    return constReg;
+  }
+
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, dl, TII->get(ARM::SUBri), OutReg);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(InReg).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    BuildMI(*BB, MI, dl, TII->get(ARM::Bcc)).addMBB(TargetBB).addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+private:
+  const unsigned UnitSize;
+  const unsigned UnitLdOpc;
+  const unsigned UnitStOpc;
+};
+
+class Thumb2StructByvalEmitter : public TargetStructByvalEmitter {
+public:
+  Thumb2StructByvalEmitter(const TargetInstrInfo *TII, MachineRegisterInfo &MRI,
+                           unsigned LoadStoreSize)
+      : TargetStructByvalEmitter(
+            TII, MRI, (const TargetRegisterClass *)&ARM::tGPRRegClass),
+        UnitSize(LoadStoreSize),
+        UnitLdOpc(LoadStoreSize == 4
+                      ? ARM::t2LDR_POST
+                      : LoadStoreSize == 2
+                            ? ARM::t2LDRH_POST
+                            : LoadStoreSize == 1 ? ARM::t2LDRB_POST : 0),
+        UnitStOpc(LoadStoreSize == 4
+                      ? ARM::t2STR_POST
+                      : LoadStoreSize == 2
+                            ? ARM::t2STRH_POST
+                            : LoadStoreSize == 1 ? ARM::t2STRB_POST : 0) {}
+
+  unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitLdOpc), scratch).addReg(
+        baseOut, RegState::Define).addReg(baseReg).addImm(UnitSize));
+    return scratch;
+  }
+
+  void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitStOpc), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addImm(UnitSize));
+  }
+
+  unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::t2LDRB_POST), scratch)
+                       .addReg(baseOut, RegState::Define).addReg(baseReg)
+                       .addImm(1));
+    return scratch;
+  }
+
+  void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::t2STRB_POST), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addImm(1));
+  }
+
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned Constant,
+                            const DataLayout *DL) {
+    unsigned VConst = MRI.createVirtualRegister(TRC);
+    unsigned Vtmp = VConst;
+    if ((Constant & 0xFFFF0000) != 0)
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+                       .addImm(Constant & 0xFFFF));
+
+    if ((Constant & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), VConst)
+                         .addReg(Vtmp).addImm(Constant >> 16));
+    return VConst;
+  }
+
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, dl, TII->get(ARM::t2SUBri), OutReg);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(InReg).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    BuildMI(BB, dl, TII->get(ARM::t2Bcc)).addMBB(TargetBB).addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+private:
+  const unsigned UnitSize;
+  const unsigned UnitLdOpc;
+  const unsigned UnitStOpc;
+};
+
+// This class is a thin wrapper that delegates most of the work to the correct
+// TargetStructByvalEmitter implementation. It also handles the lowering for
+// targets that support neon because the neon implementation is the same for all
+// targets that support it.
+class StructByvalEmitter {
+public:
+  StructByvalEmitter(unsigned LoadStoreSize, const ARMSubtarget *Subtarget,
+                     const TargetInstrInfo *TII_, MachineRegisterInfo &MRI_,
+                     const DataLayout *DL_)
+      : UnitSize(LoadStoreSize),
+        TargetEmitter(
+          Subtarget->isThumb2()
+              ? static_cast<TargetStructByvalEmitter *>(
+                    new Thumb2StructByvalEmitter(TII_, MRI_,
+                                                 LoadStoreSize))
+              : static_cast<TargetStructByvalEmitter *>(
+                    new ARMStructByvalEmitter(TII_, MRI_,
+                                              LoadStoreSize))),
+        TII(TII_), MRI(MRI_), DL(DL_),
+        VecTRC(UnitSize == 16
+                   ? (const TargetRegisterClass *)&ARM::DPairRegClass
+                   : UnitSize == 8
+                         ? (const TargetRegisterClass *)&ARM::DPRRegClass
+                         : 0),
+        VecLdOpc(UnitSize == 16 ? ARM::VLD1q32wb_fixed
+                                : UnitSize == 8 ? ARM::VLD1d32wb_fixed : 0),
+        VecStOpc(UnitSize == 16 ? ARM::VST1q32wb_fixed
+                                : UnitSize == 8 ? ARM::VST1d32wb_fixed : 0) {}
+
+  // Emit a post-increment load of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (16, 8, 4, 2, or 1 bytes). Loads of 16
+  // or 8 bytes use NEON instructions to load the value.
+  //
+  // \param baseReg the register holding the address to load.
+  // \param baseOut the register to receive the incremented address. If baseOut
+  // is 0 then a new register is created to hold the incremented address.
+  // \returns a pair of registers holding the loaded value and the updated
+  // address.
+  std::pair<unsigned, unsigned> emitUnitLoad(MachineBasicBlock *BB,
+                                             MachineInstr *MI, DebugLoc &dl,
+                                             unsigned baseReg,
+                                             unsigned baseOut = 0) {
+    unsigned scratch = 0;
+    if (baseOut == 0)
+      baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    if (UnitSize >= 8) { // neon
+      scratch = MRI.createVirtualRegister(VecTRC);
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(VecLdOpc), scratch).addReg(
+          baseOut, RegState::Define).addReg(baseReg).addImm(0));
+    } else {
+      scratch = TargetEmitter->emitUnitLoad(BB, MI, dl, baseReg, baseOut);
+    }
+    return std::make_pair(scratch, baseOut);
+  }
+
+  // Emit a post-increment store of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (16, 8, 4, 2, or 1 bytes). Stores of
+  // 16 or 8 bytes use NEON instructions to store the value.
+  //
+  // \param baseReg the register holding the address to store.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address. If baseOut
+  // is 0 then a new register is created to hold the incremented address.
+  // \returns the register holding the updated address.
+  unsigned emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                         unsigned baseReg, unsigned storeReg,
+                         unsigned baseOut = 0) {
+    if (baseOut == 0)
+      baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    if (UnitSize >= 8) { // neon
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(VecStOpc), baseOut)
+                         .addReg(baseReg).addImm(0).addReg(storeReg));
+    } else {
+      TargetEmitter->emitUnitStore(BB, MI, dl, baseReg, storeReg, baseOut);
+    }
+    return baseOut;
+  }
+
+  // Emit a post-increment load of one byte.
+  //
+  // \param baseReg the register holding the address to load.
+  // \returns a pair of registers holding the loaded value and the updated
+  // address.
+  std::pair<unsigned, unsigned> emitByteLoad(MachineBasicBlock *BB,
+                                             MachineInstr *MI, DebugLoc &dl,
+                                             unsigned baseReg) {
+    unsigned baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    unsigned scratch =
+        TargetEmitter->emitByteLoad(BB, MI, dl, baseReg, baseOut);
+    return std::make_pair(scratch, baseOut);
+  }
+
+  // Emit a post-increment store of one byte.
+  //
+  // \param baseReg the register holding the address to store.
+  // \param storeReg the register holding the value to store.
+  // \returns the register holding the updated address.
+  unsigned emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                         unsigned baseReg, unsigned storeReg) {
+    unsigned baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    TargetEmitter->emitByteStore(BB, MI, dl, baseReg, storeReg, baseOut);
+    return baseOut;
+  }
+
+  // Emit a load of the constant LoopSize.
+  //
+  // \param LoopSize the constant to load.
+  // \returns the register holding the loaded constant.
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned LoopSize) {
+    return TargetEmitter->emitConstantLoad(BB, MI, dl, LoopSize, DL);
+  }
+
+  // Emit a subtract of a register minus immediate, with the immediate equal to
+  // the "unit" size. The unit size is based on the alignment of the struct
+  // being copied (16, 8, 4, 2, or 1 bytes).
+  //
+  // \param InReg the register holding the initial value.
+  // \param OutReg the register to receive the subtracted value.
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    TargetEmitter->emitSubImm(BB, MI, dl, InReg, OutReg);
+  }
+
+  // Emit a branch based on a condition code of not equal.
+  //
+  // \param TargetBB the destination of the branch.
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    TargetEmitter->emitBranchNE(BB, MI, dl, TargetBB);
+  }
+
+  // Return the register class used by the subtarget.
+  //
+  // \returns the target register class.
+  const TargetRegisterClass *getTRC() const { return TargetEmitter->getTRC(); }
+
+private:
+  const unsigned UnitSize;
+  OwningPtr<TargetStructByvalEmitter> TargetEmitter;
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo &MRI;
+  const DataLayout *DL;
+
+  const TargetRegisterClass *VecTRC;
+  const unsigned VecLdOpc;
+  const unsigned VecStOpc;
+};
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const {
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
@@ -7261,23 +7684,13 @@ EmitStructByval(MachineInstr *MI, Machin
   unsigned Align = MI->getOperand(3).getImm();
   DebugLoc dl = MI->getDebugLoc();
 
-  bool isThumb2 = Subtarget->isThumb2();
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned ldrOpc, strOpc, UnitSize = 0;
-
-  const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
-  const TargetRegisterClass *TRC_Vec = 0;
+  unsigned UnitSize = 0;
 
   if (Align & 1) {
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     UnitSize = 1;
   } else if (Align & 2) {
-    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
-    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
@@ -7285,27 +7698,18 @@ EmitStructByval(MachineInstr *MI, Machin
           hasAttribute(AttributeSet::FunctionIndex,
                        Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
-      if ((Align % 16 == 0) && SizeVal >= 16) {
-        ldrOpc = ARM::VLD1q32wb_fixed;
-        strOpc = ARM::VST1q32wb_fixed;
+      if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
-      }
-      else if ((Align % 8 == 0) && SizeVal >= 8) {
-        ldrOpc = ARM::VLD1d32wb_fixed;
-        strOpc = ARM::VST1d32wb_fixed;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
         UnitSize = 8;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
-      }
     }
     // Can't use NEON instructions.
-    if (UnitSize == 0) {
-      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
-      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+    if (UnitSize == 0)
       UnitSize = 4;
-    }
   }
 
+  StructByvalEmitter ByvalEmitter(UnitSize, Subtarget, TII, MRI,
+                                  getDataLayout());
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
 
@@ -7316,67 +7720,22 @@ EmitStructByval(MachineInstr *MI, Machin
     unsigned srcIn = src;
     unsigned destIn = dest;
     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
-      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-      unsigned srcOut = MRI.createVirtualRegister(TRC);
-      unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (UnitSize >= 8) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(destIn).addImm(0).addReg(scratch));
-      } else if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(UnitSize));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
-          .addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(UnitSize));
-      }
-      srcIn = srcOut;
-      destIn = destOut;
+      std::pair<unsigned, unsigned> res =
+          ByvalEmitter.emitUnitLoad(BB, MI, dl, srcIn);
+      unsigned scratch = res.first;
+      srcIn = res.second;
+      destIn = ByvalEmitter.emitUnitStore(BB, MI, dl, destIn, scratch);
     }
 
     // Handle the leftover bytes with LDRB and STRB.
     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
     // [destOut] = STRB_POST(scratch, destIn, 1)
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     for (unsigned i = 0; i < BytesLeft; i++) {
-      unsigned scratch = MRI.createVirtualRegister(TRC);
-      unsigned srcOut = MRI.createVirtualRegister(TRC);
-      unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(1));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn)
-          .addReg(0).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      }
-      srcIn = srcOut;
-      destIn = destOut;
+      std::pair<unsigned, unsigned> res =
+          ByvalEmitter.emitByteLoad(BB, MI, dl, srcIn);
+      unsigned scratch = res.first;
+      srcIn = res.second;
+      destIn = ByvalEmitter.emitByteStore(BB, MI, dl, destIn, scratch);
     }
     MI->eraseFromParent();   // The instruction is gone now.
     return BB;
@@ -7414,34 +7773,7 @@ EmitStructByval(MachineInstr *MI, Machin
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   // Load an immediate to varEnd.
-  unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (isThumb2) {
-    unsigned VReg1 = varEnd;
-    if ((LoopSize & 0xFFFF0000) != 0)
-      VReg1 = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
-                   .addImm(LoopSize & 0xFFFF));
-
-    if ((LoopSize & 0xFFFF0000) != 0)
-      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                     .addReg(VReg1)
-                     .addImm(LoopSize >> 16));
-  } else {
-    MachineConstantPool *ConstantPool = MF->getConstantPool();
-    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
-    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
-
-    // MachineConstantPool wants an explicit alignment.
-    unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
-    if (Align == 0)
-      Align = getDataLayout()->getTypeAllocSize(C->getType());
-    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
-                   .addReg(varEnd, RegState::Define)
-                   .addConstantPoolIndex(Idx)
-                   .addImm(0));
-  }
+  unsigned varEnd = ByvalEmitter.emitConstantLoad(BB, MI, dl, LoopSize);
   BB->addSuccessor(loopMBB);
 
   // Generate the loop body:
@@ -7450,12 +7782,12 @@ EmitStructByval(MachineInstr *MI, Machin
   //   destPhi = PHI(destLoop, dst)
   MachineBasicBlock *entryBB = BB;
   BB = loopMBB;
-  unsigned varLoop = MRI.createVirtualRegister(TRC);
-  unsigned varPhi = MRI.createVirtualRegister(TRC);
-  unsigned srcLoop = MRI.createVirtualRegister(TRC);
-  unsigned srcPhi = MRI.createVirtualRegister(TRC);
-  unsigned destLoop = MRI.createVirtualRegister(TRC);
-  unsigned destPhi = MRI.createVirtualRegister(TRC);
+  unsigned varLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned varPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned srcLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned srcPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned destLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned destPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
 
   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
     .addReg(varLoop).addMBB(loopMBB)
@@ -7469,39 +7801,16 @@ EmitStructByval(MachineInstr *MI, Machin
 
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
-  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-  if (UnitSize >= 8) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(destPhi).addImm(0).addReg(scratch));
-  } else if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addImm(UnitSize));
-  } else {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
-      .addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addReg(0).addImm(UnitSize));
+  {
+    std::pair<unsigned, unsigned> res =
+        ByvalEmitter.emitUnitLoad(BB, BB->end(), dl, srcPhi, srcLoop);
+    unsigned scratch = res.first;
+    ByvalEmitter.emitUnitStore(BB, BB->end(), dl, destPhi, scratch, destLoop);
   }
 
   // Decrement loop variable by UnitSize.
-  MachineInstrBuilder MIB = BuildMI(BB, dl,
-    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
-  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
-  MIB->getOperand(5).setReg(ARM::CPSR);
-  MIB->getOperand(5).setIsDef(true);
-
-  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
-    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  ByvalEmitter.emitSubImm(BB, BB->end(), dl, varPhi, varLoop);
+  ByvalEmitter.emitBranchNE(BB, BB->end(), dl, loopMBB);
 
   // loopMBB can loop back to loopMBB or fall through to exitMBB.
   BB->addSuccessor(loopMBB);
@@ -7510,36 +7819,17 @@ EmitStructByval(MachineInstr *MI, Machin
   // Add epilogue to handle BytesLeft.
   BB = exitMBB;
   MachineInstr *StartOfExit = exitMBB->begin();
-  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
 
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   unsigned srcIn = srcLoop;
   unsigned destIn = destLoop;
   for (unsigned i = 0; i < BytesLeft; i++) {
-    unsigned scratch = MRI.createVirtualRegister(TRC);
-    unsigned srcOut = MRI.createVirtualRegister(TRC);
-    unsigned destOut = MRI.createVirtualRegister(TRC);
-    if (isThumb2) {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addImm(1));
-    } else {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addReg(0).addImm(1));
-    }
-    srcIn = srcOut;
-    destIn = destOut;
+    std::pair<unsigned, unsigned> res =
+        ByvalEmitter.emitByteLoad(BB, StartOfExit, dl, srcIn);
+    unsigned scratch = res.first;
+    srcIn = res.second;
+    destIn = ByvalEmitter.emitByteStore(BB, StartOfExit, dl, destIn, scratch);
   }
 
   MI->eraseFromParent();   // The instruction is gone now.