[llvm] r205090 - ARM64: initial backend import

Tim Northover tnorthover at apple.com
Sat Mar 29 03:18:15 PDT 2014


Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp Sat Mar 29 05:18:08 2014
@@ -0,0 +1,563 @@
+//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "MCTargetDesc/ARM64MCExpr.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+
+class ARM64MCCodeEmitter : public MCCodeEmitter {
+  // Context used for register-encoding lookups and for creating fixups.
+  MCContext &Ctx;
+
+  ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const ARM64MCCodeEmitter &);     // DO NOT IMPLEMENT
+public:
+  // NOTE(review): mcii and sti are accepted but unused; only the MCContext
+  // is retained.
+  ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                     MCContext &ctx)
+      : Ctx(ctx) {}
+
+  ~ARM64MCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// getAMIndexed8OpValue - Return encoding info for base register
+  /// and 12-bit unsigned immediate attached to a load, store or prfm
+  /// instruction. If operand requires a relocation, record it and
+  /// return zero in that part of the encoding.
+  template <uint32_t FixupKind>
+  uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+  /// target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+  /// the 2-bit shift field.
+  uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+  /// branch target.
+  uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+  /// branch target.
+  uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getBranchTargetOpValue - Return the encoded value for an unconditional
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+  /// of a MOVZ or MOVK instruction.
+  uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+  uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+  /// shifter (MSL).
+  uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getFixedPointScaleOpValue - Return the encoded value for the
+  // FP-to-fixed-point scale factor.
+  uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  // Encoders for right/left vector shift-by-immediate amounts, one per
+  // element width.
+  uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getSIMDShift64OpValue - Return the encoded value for the
+  // shift-by-immediate AdvSIMD instructions.
+  uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // fixMOVZ - Post-process a MOVZ encoding that carries a signed fixup (see
+  // the definition below for the bit-30 handling).
+  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                   const MCSubtargetInfo &STI) const;
+
+  // Write a single byte to the output stream.
+  void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
+      Val >>= 8;
+    }
+  }
+
+  // Main entry point: encode MI into OS, recording any fixups needed.
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const;
+};
+
+} // end anonymous namespace
+
+// Factory hook used by MC target registration to build the ARM64 code
+// emitter. Ownership of the returned emitter passes to the caller.
+MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  // MRI is unused here; register encodings are obtained through Ctx.
+  return new ARM64MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  // Registers are encoded through the target register info.
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Anything else must already be an immediate: expressions are lowered to
+  // fixups by the specialized operand encoders, not here.
+  assert(MO.isImm() && "did not expect relocated expression");
+  return static_cast<unsigned>(MO.getImm());
+}
+
+// getAMIndexed8OpValue - Encode a [base-register, unsigned-immediate]
+// addressing-mode operand pair starting at OpIdx. The base register occupies
+// bits 0-4 and the immediate bits 5 and up; if the immediate is still a
+// symbolic expression, a fixup of kind FixupKind is recorded and the
+// immediate field is left as zero for the fixup to fill in.
+template <uint32_t FixupKind>
+uint32_t
+ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  unsigned BaseReg = MI.getOperand(OpIdx).getReg();
+  BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg);
+
+  // The immediate (or expression) is the operand immediately after the base.
+  const MCOperand &MO = MI.getOperand(OpIdx + 1);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
+  }
+
+  return BaseReg | (ImmVal << 5);
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
+  const MCExpr *Expr = MO.getExpr();
+
+  // ADR and ADRP resolve differently (byte vs. page offset), so they take
+  // distinct fixup kinds.
+  MCFixupKind Kind = MI.getOpcode() == ARM64::ADR
+                         ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21)
+                         : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the shift status.  The immediate occupies bits 0-11 of the return value;
+/// bit 12 is set when the operand carries an LSL #12 shift.
+uint32_t
+ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm());
+  // Add/sub immediates only support a shift of 0 or 12.
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
+  if (MO.isImm())
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
+  assert(ShiftVal == 0 && "shift not allowed on add/sub immediate with fixup");
+
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  // Symbolic targets get a 19-bit pc-relative immediate fixup.
+  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_imm19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+/// of a MOVZ or MOVK instruction. Symbolic immediates produce a
+/// fixup_arm64_movw fixup and leave the field zero.
+uint32_t
+ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+  Fixups.push_back(MCFixup::Create(
+      0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected test-branch target type!");
+
+  // TBZ/TBNZ carry a 14-bit pc-relative branch displacement.
+  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+
+  // BL gets a call-specific fixup so relocations can distinguish calls from
+  // plain branches; everything else uses the generic 26-bit branch fixup.
+  MCFixupKind Kind = MI.getOpcode() == ARM64::BL
+                         ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26)
+                         : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+///   00 -> 0
+///   01 -> 8
+///   10 -> 16
+///   11 -> 24
+uint32_t
+ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+  // Map the byte-granularity shift amount to its 2-bit encoding.
+  switch (MO.getImm()) {
+  default:
+    break;
+  case 0:
+    return 0;
+  case 8:
+    return 1;
+  case 16:
+    return 2;
+  case 24:
+    return 3;
+  }
+
+  assert(false && "Invalid value for vector shift amount!");
+  return 0;
+}
+
+// getSIMDShift64OpValue - Encode a shift-by-immediate amount for 64-bit
+// element AdvSIMD instructions: a shift of N is encoded as 64 - N.
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm());
+}
+
+// getSIMDShift64_32OpValue - Encode a shift amount where the immediate is
+// first OR'ed with 32 before the 64 - x bias is applied.
+// NOTE(review): this assumes the incoming shift amount fits below bit 5 so
+// that the OR acts as an addition of 32 — confirm against the TableGen
+// operand definition.
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm() | 32);
+}
+
+// getSIMDShift32OpValue - Encode a shift amount as 32 - (imm | 16).
+// NOTE(review): as with getSIMDShift64_32OpValue, the OR is assumed to act
+// as +16 for in-range shift amounts — confirm against the operand class.
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 32 - (MO.getImm() | 16);
+}
+
+// getSIMDShift16OpValue - Encode a shift amount as 16 - (imm | 8).
+// NOTE(review): the OR is assumed to act as +8 for in-range shift amounts —
+// confirm against the operand class.
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 16 - (MO.getImm() | 8);
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+/// FP-to-fixed-point scale factor: a scale of N is encoded as 64 - N.
+uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
+
+// getVecShiftR64OpValue - Encode a right-shift amount for 64-bit elements:
+// a shift of N is encoded as 64 - N.
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - MO.getImm();
+}
+
+// getVecShiftR32OpValue - Encode a right-shift amount for 32-bit elements:
+// a shift of N is encoded as 32 - N.
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 32 - MO.getImm();
+}
+
+// getVecShiftR16OpValue - Encode a right-shift amount for 16-bit elements:
+// a shift of N is encoded as 16 - N.
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 16 - MO.getImm();
+}
+
+// getVecShiftR8OpValue - Encode a right-shift amount for 8-bit elements:
+// a shift of N is encoded as 8 - N.
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 8 - MO.getImm();
+}
+
+// getVecShiftL64OpValue - Encode a left-shift amount for 64-bit elements as
+// imm - 64.
+// NOTE(review): assumes the incoming operand is biased by the element size
+// (i.e. carries 64 + shift) — confirm against the TableGen operand class.
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return MO.getImm() - 64;
+}
+
+// getVecShiftL32OpValue - Encode a left-shift amount for 32-bit elements as
+// imm - 32 (see the bias note on getVecShiftL64OpValue).
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return MO.getImm() - 32;
+}
+
+// getVecShiftL16OpValue - Encode a left-shift amount for 16-bit elements as
+// imm - 16 (see the bias note on getVecShiftL64OpValue).
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return MO.getImm() - 16;
+}
+
+// getVecShiftL8OpValue - Encode a left-shift amount for 8-bit elements as
+// imm - 8 (see the bias note on getVecShiftL64OpValue).
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL): MSL #8 encodes as 0, MSL #16 encodes as 1.
+uint32_t
+ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() &&
+         "Expected an immediate value for the move shift amount!");
+  unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm());
+  // MSL only supports shifts of 8 or 16.
+  assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+  return ShiftVal == 8 ? 0 : 1;
+}
+
+// fixMOVZ - Post-process the encoding of a MOVZ whose immediate carries a
+// fixup, so the relocation can later turn it into either MOVZ or MOVN.
+unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                                     const MCSubtargetInfo &STI) const {
+  // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+  // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+  // job to ensure that any bits possibly affected by this are 0. This means we
+  // must zero out bit 30 (essentially emitting a MOVN).
+  MCOperand UImm16MO = MI.getOperand(1);
+
+  // Nothing to do if there's no fixup.
+  if (UImm16MO.isImm())
+    return EncodedValue;
+
+  return EncodedValue & ~(1u << 30);
+}
+
+// EncodeInstruction - Emit the 32-bit little-endian encoding of MI to OS,
+// recording any fixups required for symbolic operands.
+void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  if (MI.getOpcode() == ARM64::TLSDESCCALL) {
+    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+    // following (BLR) instruction. It doesn't emit any code itself so it
+    // doesn't go through the normal TableGenerated channels.
+    MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call);
+    Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
+    return;
+  }
+
+  // All ARM64 instructions are fixed 4 bytes wide.
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  EmitConstant(Binary, 4, OS);
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+#include "ARM64GenMCCodeEmitter.inc"

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp Sat Mar 29 05:18:08 2014
@@ -0,0 +1,168 @@
+//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+#include "ARM64MCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// Create - Allocate an ARM64MCExpr wrapping Expr with the given modifier
+// kind. The node is allocated in (and owned by) the MCContext.
+const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+                                       MCContext &Ctx) {
+  return new (Ctx) ARM64MCExpr(Expr, Kind);
+}
+
+// getVariantKindName - Return the assembly syntax for this expression's
+// modifier (e.g. ":lo12:"), or "" for kinds with no textual prefix.
+StringRef ARM64MCExpr::getVariantKindName() const {
+  switch (static_cast<uint32_t>(getKind())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  // NOTE(review): _NC kind intentionally(?) prints without an "_nc" suffix
+  // — confirm this matches the assembler's accepted spelling.
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
+}
+
+// PrintImpl - Print the modifier prefix (if any) followed by the wrapped
+// sub-expression.
+void ARM64MCExpr::PrintImpl(raw_ostream &OS) const {
+  if (getKind() != VK_NONE)
+    OS << getVariantKindName();
+  OS << *Expr;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+// FIXME: really do above: now that two backends are using it.
+//
+// Recursively walk an expression tree and register symbol data in the
+// assembler for every symbol reference found.
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbolsImpl(BE->getLHS(), Asm);
+    AddValueSymbolsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+// AddValueSymbols - Register all symbols referenced by the wrapped
+// sub-expression with the assembler.
+void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbolsImpl(getSubExpr(), Asm);
+}
+
+// FindAssociatedSection - Deliberately unimplemented; reaching this is a
+// bug (see the FIXME in the message).
+const MCSection *ARM64MCExpr::FindAssociatedSection() const {
+  llvm_unreachable("FIXME: what goes here?");
+}
+
+// EvaluateAsRelocatableImpl - Evaluate the wrapped sub-expression, then
+// re-tag the result with this expression's variant kind so the relocation
+// machinery knows which modifier applies.
+bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                            const MCAsmLayout *Layout) const {
+  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+    return false;
+
+  Res =
+      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
+}
+
+// Recursively mark every symbol referenced by Expr as STT_TLS in the
+// assembler's ELF symbol data. Only called once the caller has established
+// that Expr sits under a TLS modifier.
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expression");
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+// fixELFSymbolsInTLSFixups - If this expression uses one of the TLS symbol
+// locations, tag every referenced symbol as STT_TLS; otherwise do nothing.
+void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getSymbolLoc(Kind)) {
+  default:
+    return;
+  case VK_DTPREL:
+  case VK_GOTTPREL:
+  case VK_TPREL:
+  case VK_TLSDESC:
+    break;
+  }
+
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h Sat Mar 29 05:18:08 2014
@@ -0,0 +1,162 @@
+//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes ARM64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM64MCEXPR_H
+#define LLVM_ARM64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// Target-specific expression wrapper carrying an ARM64 relocation modifier
+/// (e.g. ":lo12:", ":gottprel_g1:") around an ordinary MCExpr. The modifier
+/// is a VariantKind bitfield: symbol-location bits, address-fragment bits,
+/// and a "not checked" flag.
+class ARM64MCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NONE     = 0x000,
+
+    // Symbol locations specifying (roughly speaking) what calculation should be
+    // performed to construct the final address for the relocated
+    // symbol. E.g. direct, via the GOT, ...
+    VK_ABS      = 0x001,
+    VK_SABS     = 0x002,
+    VK_GOT      = 0x003,
+    VK_DTPREL   = 0x004,
+    VK_GOTTPREL = 0x005,
+    VK_TPREL    = 0x006,
+    VK_TLSDESC  = 0x007,
+    VK_SymLocBits = 0x00f,
+
+    // Variants specifying which part of the final address calculation is
+    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+    // MOVZ/MOVK.
+    VK_PAGE     = 0x010,
+    VK_PAGEOFF  = 0x020,
+    VK_G0       = 0x030,
+    VK_G1       = 0x040,
+    VK_G2       = 0x050,
+    VK_G3       = 0x060,
+    VK_AddressFragBits = 0x0f0,
+
+    // Whether the final relocation is a checked one (where a linker should
+    // perform a range-check on the final address) or not. Note that this field
+    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+    // on its own is a non-checked relocation. We side with ELF on being
+    // explicit about this!
+    VK_NC       = 0x100,
+
+    // Convenience definitions for referring to specific textual representations
+    // of relocation specifiers. Note that this means the "_NC" is sometimes
+    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+    // since a user would write ":lo12:").
+    VK_CALL              = VK_ABS,
+    VK_ABS_PAGE          = VK_ABS      | VK_PAGE,
+    VK_ABS_G3            = VK_ABS      | VK_G3,
+    VK_ABS_G2            = VK_ABS      | VK_G2,
+    VK_ABS_G2_NC         = VK_ABS      | VK_G2      | VK_NC,
+    VK_ABS_G1            = VK_ABS      | VK_G1,
+    VK_ABS_G1_NC         = VK_ABS      | VK_G1      | VK_NC,
+    VK_ABS_G0            = VK_ABS      | VK_G0,
+    VK_ABS_G0_NC         = VK_ABS      | VK_G0      | VK_NC,
+    VK_LO12              = VK_ABS      | VK_PAGEOFF | VK_NC,
+    VK_GOT_LO12          = VK_GOT      | VK_PAGEOFF | VK_NC,
+    VK_GOT_PAGE          = VK_GOT      | VK_PAGE,
+    VK_DTPREL_G2         = VK_DTPREL   | VK_G2,
+    VK_DTPREL_G1         = VK_DTPREL   | VK_G1,
+    VK_DTPREL_G1_NC      = VK_DTPREL   | VK_G1      | VK_NC,
+    VK_DTPREL_G0         = VK_DTPREL   | VK_G0,
+    VK_DTPREL_G0_NC      = VK_DTPREL   | VK_G0      | VK_NC,
+    VK_DTPREL_LO12       = VK_DTPREL   | VK_PAGEOFF,
+    VK_DTPREL_LO12_NC    = VK_DTPREL   | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_PAGE     = VK_GOTTPREL | VK_PAGE,
+    VK_GOTTPREL_LO12_NC  = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_G1       = VK_GOTTPREL | VK_G1,
+    VK_GOTTPREL_G0_NC    = VK_GOTTPREL | VK_G0      | VK_NC,
+    VK_TPREL_G2          = VK_TPREL    | VK_G2,
+    VK_TPREL_G1          = VK_TPREL    | VK_G1,
+    VK_TPREL_G1_NC       = VK_TPREL    | VK_G1      | VK_NC,
+    VK_TPREL_G0          = VK_TPREL    | VK_G0,
+    VK_TPREL_G0_NC       = VK_TPREL    | VK_G0      | VK_NC,
+    VK_TPREL_LO12        = VK_TPREL    | VK_PAGEOFF,
+    VK_TPREL_LO12_NC     = VK_TPREL    | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_LO12      = VK_TLSDESC  | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_PAGE      = VK_TLSDESC  | VK_PAGE,
+
+    VK_INVALID  = 0xfff
+  };
+
+private:
+  // The expression the modifier applies to.
+  const MCExpr *Expr;
+  // The modifier itself; immutable for the lifetime of the node.
+  const VariantKind Kind;
+
+  explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind)
+    : Expr(Expr), Kind(Kind) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  /// Create a modifier expression in \p Ctx wrapping \p Expr with \p Kind.
+  static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+                                   MCContext &Ctx);
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// Get the kind of this expression.
+  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
+
+  /// Get the expression this modifier applies to.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+  /// @name VariantKind information extractors.
+  /// @{
+
+  /// Extract just the symbol-location bits (VK_ABS, VK_GOT, ...).
+  static VariantKind getSymbolLoc(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_SymLocBits);
+  }
+
+  /// Extract just the address-fragment bits (VK_PAGE, VK_G0, ...).
+  static VariantKind getAddressFrag(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+  }
+
+  /// True if the relocation is not range-checked by the linker.
+  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+  /// @}
+
+  /// Convert the variant kind into an ELF-appropriate modifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getVariantKindName() const;
+
+  /// Print the modifier followed by the sub-expression.
+  void PrintImpl(raw_ostream &OS) const;
+
+  /// MCTargetExpr hook: register referenced symbols with the assembler.
+  void AddValueSymbols(MCAssembler *) const;
+
+  const MCSection *FindAssociatedSection() const;
+
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+
+  /// Mark symbols under TLS-flavoured modifiers as STT_TLS.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const ARM64MCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp Sat Mar 29 05:18:08 2014
@@ -0,0 +1,167 @@
+//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64MCTargetDesc.h"
+#include "ARM64ELFStreamer.h"
+#include "ARM64MCAsmInfo.h"
+#include "InstPrinter/ARM64InstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARM64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARM64GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "ARM64GenRegisterInfo.inc"
+
+using namespace llvm;
+
+// Build and populate the tablegen'erated instruction info table.
+static MCInstrInfo *createARM64MCInstrInfo() {
+  MCInstrInfo *Info = new MCInstrInfo();
+  InitARM64MCInstrInfo(Info);
+  return Info;
+}
+
+// Build subtarget info for the given triple, CPU and feature string.
+static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo *STI = new MCSubtargetInfo();
+  InitARM64MCSubtargetInfo(STI, TT, CPU, FS);
+  return STI;
+}
+
+// Build the register info table. LR is ARM64's return-address register.
+static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) {
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitARM64MCRegisterInfo(RegInfo, ARM64::LR);
+  return RegInfo;
+}
+
+// Select the Darwin or ELF flavour of MCAsmInfo for the triple and seed it
+// with the initial CFI state (CFA is SP at function entry).
+static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI,
+                                       StringRef TT) {
+  Triple TheTriple(TT);
+
+  MCAsmInfo *AsmInfo;
+  if (TheTriple.isOSDarwin()) {
+    AsmInfo = new ARM64MCAsmInfoDarwin();
+  } else {
+    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+    AsmInfo = new ARM64MCAsmInfoELF();
+  }
+
+  // Initial state of the frame pointer is SP.
+  unsigned DwarfSP = MRI.getDwarfRegNum(ARM64::SP, true);
+  AsmInfo->addInitialFrameState(MCCFIInstruction::createDefCfa(0, DwarfSP, 0));
+
+  return AsmInfo;
+}
+
+/// Build the codegen info (relocation and code model) for ARM64.
+///
+/// Only the small and large code models are supported. Darwin is always PIC;
+/// on ELF, Default/DynamicNoPIC are mapped to Static.
+// Marked 'static' for consistency with every other factory function in this
+// file: it is only referenced via the TargetRegistry registration below, and
+// without 'static' it leaks an external symbol (and trips
+// -Wmissing-prototypes).
+static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  Triple TheTriple(TT);
+  assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+         "Only expect Darwin and ELF targets");
+
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  // The default MCJIT memory managers make no guarantees about where they can
+  // find an executable page; JITed code needs to be able to refer to globals
+  // no matter how far away they are.
+  else if (CM == CodeModel::JITDefault)
+    CM = CodeModel::Large;
+  else if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error("Only small and large code models are allowed on ARM64");
+
+  // ARM64 Darwin is always PIC.
+  if (TheTriple.isOSDarwin())
+    RM = Reloc::PIC_;
+  // On ELF platforms the default static relocation model has a smart enough
+  // linker to cope with referencing external symbols defined in a shared
+  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+    RM = Reloc::Static;
+
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+// Create an instruction printer for the requested assembly syntax variant:
+// variant 0 is the generic printer, variant 1 the Apple-flavoured one.
+static MCInstPrinter *createARM64MCInstPrinter(const Target &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI) {
+  switch (SyntaxVariant) {
+  case 0:
+    return new ARM64InstPrinter(MAI, MII, MRI, STI);
+  case 1:
+    return new ARM64AppleInstPrinter(MAI, MII, MRI, STI);
+  default:
+    // Unknown variants get no printer.
+    return 0;
+  }
+}
+
+// Create the object streamer: Mach-O for Darwin triples, ELF otherwise.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &TAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll,
+                                    bool NoExecStack) {
+  if (Triple(TT).isOSDarwin())
+    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+                               /*LabelSections*/ true);
+
+  return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+// Force static initialization.
+/// Entry point called by the target registry machinery: registers every
+/// MC-layer component (asm info, codegen info, instr/reg/subtarget info,
+/// asm backend, code emitter, object streamer, inst printer) for ARM64.
+extern "C" void LLVMInitializeARM64TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheARM64Target, createARM64MCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
+                                        createARM64MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createARM64MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheARM64Target, createARM64MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
+                                          createARM64MCSubtargetInfo);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createARM64AsmBackend);
+
+  // Register the MC code emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
+                                        createARM64MCCodeEmitter);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
+                                        createARM64MCInstPrinter);
+}

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h Sat Mar 29 05:18:08 2014
@@ -0,0 +1,62 @@
+//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64MCTARGETDESC_H
+#define ARM64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCObjectWriter;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+// The ARM64 target singleton, defined in the TargetInfo library.
+extern Target TheARM64Target;
+
+/// Create the ARM64 machine-code emitter (instruction encoder).
+MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+/// Create the ARM64 assembler backend (fixup application & relaxation).
+MCAsmBackend *createARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    StringRef TT, StringRef CPU);
+
+/// Create an ELF object writer for ARM64.
+MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
+
+/// Create a Mach-O object writer for ARM64.
+MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
+                                            uint32_t CPUSubtype);
+
+} // End llvm namespace
+
+// Defines symbolic names for ARM64 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARM64GenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARM64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARM64GenSubtargetInfo.inc"
+
+#endif

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp Sat Mar 29 05:18:08 2014
@@ -0,0 +1,396 @@
+//===-- ARM64MachObjectWriter.cpp - ARM64 Mach Object Writer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+/// Mach-O object-writer hooks for ARM64: maps LLVM fixups to Mach-O ARM64
+/// relocation types and records relocation entries.
+class ARM64MachObjectWriter : public MCMachObjectTargetWriter {
+  // Translate a fixup (plus its symbol modifier) into a Mach-O relocation
+  // type and log2 size; returns false for fixups with no Mach-O relocation.
+  bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+                                  const MCSymbolRefExpr *Sym,
+                                  unsigned &Log2Size, const MCAssembler &Asm);
+
+public:
+  ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
+                                 /*UseAggressiveSymbolFolding=*/true) {}
+
+  // Emit the Mach-O relocation entry (or entries) for one fixup.
+  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue);
+};
+}
+
+/// Map an LLVM fixup kind to the Mach-O ARM64 relocation type and log2 size.
+/// \p Sym supplies the @-modifier (GOT, TLVP, PAGE, PAGEOFF) that selects a
+/// more specific relocation for data and imm12/imm21 fixups. Returns false
+/// for fixup kinds that have no Mach-O relocation.
+bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
+    const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+    unsigned &Log2Size, const MCAssembler &Asm) {
+  RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+  Log2Size = ~0U;
+
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case ARM64::fixup_arm64_add_imm12:
+  case ARM64::fixup_arm64_ldst_imm12_scale1:
+  case ARM64::fixup_arm64_ldst_imm12_scale2:
+  case ARM64::fixup_arm64_ldst_imm12_scale4:
+  case ARM64::fixup_arm64_ldst_imm12_scale8:
+  case ARM64::fixup_arm64_ldst_imm12_scale16:
+    Log2Size = llvm::Log2_32(4);
+    switch (Sym->getKind()) {
+    default:
+      // This was assert(0): in release (NDEBUG) builds the assert vanished
+      // and control fell through to the VK_PAGEOFF case, silently emitting a
+      // PAGEOFF12 relocation for an unexpected variant kind. Diagnose it
+      // properly instead, matching the ADRP handling below.
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unexpected symbol reference variant kind!");
+    case MCSymbolRefExpr::VK_PAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+      return true;
+    }
+  case ARM64::fixup_arm64_pcrel_adrp_imm21:
+    Log2Size = llvm::Log2_32(4);
+    // This encompasses the relocation for the whole 21-bit value.
+    switch (Sym->getKind()) {
+    default:
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "ADR/ADRP relocations must be GOT relative");
+    case MCSymbolRefExpr::VK_PAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+      return true;
+    }
+    return true;
+  case ARM64::fixup_arm64_pcrel_branch26:
+  case ARM64::fixup_arm64_pcrel_call26:
+    Log2Size = llvm::Log2_32(4);
+    RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+    return true;
+  }
+}
+
+/// Emit the Mach-O relocation entry (or pair of entries) for one fixup.
+/// Handles three shapes of Target: an absolute constant, a difference
+/// "A - B + constant" (emitted as UNSIGNED + SUBTRACTOR pair), and the
+/// common "A + constant" case. Any residual addend is either folded into
+/// FixedValue (encoded in the instruction) or, for BRANCH26/PAGE21/PAGEOFF12,
+/// emitted as a separate ARM64_RELOC_ADDEND entry.
+void ARM64MachObjectWriter::RecordRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+  unsigned Log2Size = 0;
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+  unsigned Kind = Fixup.getKind();
+
+  FixupOffset += Fixup.getOffset();
+
+  // ARM64 pcrel relocation addends do not include the section offset.
+  if (IsPCRel)
+    FixedValue += FixupOffset;
+
+  // ADRP fixups use relocations for the whole symbol value and only
+  // put the addend in the instruction itself. Clear out any value the
+  // generic code figured out from the symbol definition.
+  if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21 ||
+      Kind == ARM64::fixup_arm64_pcrel_imm19)
+    FixedValue = 0;
+
+  // imm19 relocations are for conditional branches, which require
+  // assembler local symbols. If we got here, that's not what we have,
+  // so complain loudly.
+  if (Kind == ARM64::fixup_arm64_pcrel_imm19) {
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "conditional branch requires assembler-local"
+                                " label. '" +
+                                    Target.getSymA()->getSymbol().getName() +
+                                    "' is external.");
+    return;
+  }
+
+  // 14-bit branch relocations should only target internal labels, and so
+  // should never get here.
+  if (Kind == ARM64::fixup_arm64_pcrel_branch14) {
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "Invalid relocation on conditional branch!");
+    return;
+  }
+
+  if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+                                  Asm)) {
+    Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!");
+    return;
+  }
+
+  Value = Target.getConstant();
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME: Should this always be extern?
+    // SymbolNum of 0 indicates the absolute section.
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+    Index = 0;
+
+    if (IsPCRel) {
+      IsExtern = 1;
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "PC relative absolute relocation!");
+
+      // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+      // something similar?
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+    // Check for "_foo@got - .", which comes through here as:
+    // Ltmp0:
+    //    ... _foo@got - Ltmp0
+    if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+        Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+        Layout.getSymbolOffset(&B_SD) ==
+            Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+      // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+      Index = A_Base->getIndex();
+      IsExtern = 1;
+      Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+      IsPCRel = 1;
+      MachO::any_relocation_info MRE;
+      MRE.r_word0 = FixupOffset;
+      MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                     (IsExtern << 27) | (Type << 28));
+      Writer->addRelocation(Fragment->getParent(), MRE);
+      return;
+    } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+               Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+      // Otherwise, neither symbol can be modified.
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation of modified symbol");
+
+    // We don't support PCrel relocations of differences.
+    if (IsPCRel)
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported pc-relative relocation of "
+                                  "difference");
+
+    // ARM64 always uses external relocations. If there is no symbol to use as
+    // a base address (a local symbol with no preceding non-local symbol),
+    // error out.
+    //
+    // FIXME: We should probably just synthesize an external symbol and use
+    // that.
+    if (!A_Base)
+      Asm.getContext().FatalError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + A->getName() +
+              "'. Must have non-local symbol earlier in section.");
+    if (!B_Base)
+      Asm.getContext().FatalError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + B->getName() +
+              "'. Must have non-local symbol earlier in section.");
+
+    if (A_Base == B_Base && A_Base)
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation with identical base");
+
+    // Fold each symbol's offset from its atom (base) into the addend; the
+    // relocations themselves reference the bases.
+    Value += (A_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
+                                                   &A_SD, Layout)) -
+             (A_Base == NULL || A_Base->getFragment() == NULL
+                  ? 0
+                  : Writer->getSymbolAddress(A_Base, Layout));
+    Value -= (B_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
+                                                   &B_SD, Layout)) -
+             (B_Base == NULL || B_Base->getFragment() == NULL
+                  ? 0
+                  : Writer->getSymbolAddress(B_Base, Layout));
+
+    // Emit the UNSIGNED relocation for A here; the SUBTRACTOR for B is
+    // emitted by the shared code at the bottom of the function.
+    Index = A_Base->getIndex();
+    IsExtern = 1;
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                   (IsExtern << 27) | (Type << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    Index = B_Base->getIndex();
+    IsExtern = 1;
+    Type = MachO::ARM64_RELOC_SUBTRACTOR;
+  } else { // A + constant
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+    const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
+        Fragment->getParent()->getSection());
+
+    // If the symbol is a variable and we weren't able to get a Base for it
+    // (i.e., it's not in the symbol table associated with a section) resolve
+    // the relocation based its expansion instead.
+    if (Symbol->isVariable() && !Base) {
+      // If the evaluation is an absolute value, just use that directly
+      // to keep things easy.
+      int64_t Res;
+      if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+
+      // FIXME: Will the Target we already have ever have any data in it
+      // we need to preserve and merge with the new Target? How about
+      // the FixedValue?
+      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+        Asm.getContext().FatalError(Fixup.getLoc(),
+                                    "unable to resolve variable '" +
+                                        Symbol->getName() + "'");
+      // Retry with the variable's expansion as the new Target.
+      return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              FixedValue);
+    }
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand relocation entries and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+        Base = 0;
+    }
+
+    // ARM64 uses external relocations as much as possible. For debug sections,
+    // and for pointer-sized relocations (.quad), we allow section relocations.
+    // It's code sections that run into trouble.
+    if (Base) {
+      Index = Base->getIndex();
+      IsExtern = 1;
+
+      // Add the local offset, if needed.
+      if (Base != &SD)
+        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+    } else if (Symbol->isInSection()) {
+      // Pointer-sized relocations can use a local relocation. Otherwise,
+      // we have to be in a debug info section.
+      if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+        Asm.getContext().FatalError(
+            Fixup.getLoc(),
+            "unsupported relocation of local symbol '" + Symbol->getName() +
+                "'. Must have non-local symbol earlier in section.");
+      // Adjust the relocation to be section-relative.
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD =
+          Asm.getSectionData(SD.getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      IsExtern = 0;
+      Value += Writer->getSymbolAddress(&SD, Layout);
+
+      if (IsPCRel)
+        Value -= Writer->getFragmentAddress(Fragment, Layout) +
+                 Fixup.getOffset() + (1 << Log2Size);
+    } else {
+      // Resolve constant variables.
+      if (SD.getSymbol().isVariable()) {
+        int64_t Res;
+        if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+                Res, Layout, Writer->getSectionAddressMap())) {
+          FixedValue = Res;
+          return;
+        }
+      }
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  "unsupported relocation of variable '" +
+                                      Symbol->getName() + "'");
+    }
+  }
+
+  // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+  // is represented via an Addend relocation, not encoded directly into
+  // the instruction.
+  if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+       Type == MachO::ARM64_RELOC_PAGE21 ||
+       Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+      Value) {
+    // NOTE(review): the assert message presumably means "Addend relocation".
+    assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                   (IsExtern << 27) | (Type << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    // Now set up the Addend relocation.
+    Type = MachO::ARM64_RELOC_ADDEND;
+    Index = Value;
+    IsPCRel = 0;
+    Log2Size = 2;
+    IsExtern = 0;
+
+    // Put zero into the instruction itself. The addend is in the relocation.
+    Value = 0;
+  }
+
+  // If there's any addend left to handle, encode it in the instruction.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                 (IsExtern << 27) | (Type << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+/// Factory for the ARM64 Mach-O object writer. ARM64 Mach-O is always
+/// little-endian and 64-bit.
+MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS,
+                                                  uint32_t CPUType,
+                                                  uint32_t CPUSubtype) {
+  MCMachObjectTargetWriter *TargetWriter =
+      new ARM64MachObjectWriter(CPUType, CPUSubtype);
+  return createMachObjectWriter(TargetWriter, OS, /*IsLittleEndian=*/true);
+}

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt Sat Mar 29 05:18:08 2014
@@ -0,0 +1,14 @@
+# MC-layer (object emission) support library for the ARM64 backend.
+add_llvm_library(LLVMARM64Desc
+  ARM64AsmBackend.cpp
+  ARM64ELFObjectWriter.cpp
+  ARM64ELFStreamer.cpp
+  ARM64MCAsmInfo.cpp
+  ARM64MCCodeEmitter.cpp
+  ARM64MCExpr.cpp
+  ARM64MCTargetDesc.cpp
+  ARM64MachObjectWriter.cpp
+)
+# The sources include tablegen'erated .inc files, so build them first.
+add_dependencies(LLVMARM64Desc ARM64CommonTableGen)
+
+# Hack: we need to include 'main' target directory to grab private headers
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt Sat Mar 29 05:18:08 2014
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64Desc
+parent = ARM64
+required_libraries = ARM64AsmPrinter ARM64Info MC Support
+add_to_library_groups = ARM64
+

Added: llvm/trunk/lib/Target/ARM64/MCTargetDesc/Makefile
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/MCTargetDesc/Makefile?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/MCTargetDesc/Makefile (added)
+++ llvm/trunk/lib/Target/ARM64/MCTargetDesc/Makefile Sat Mar 29 05:18:08 2014
@@ -0,0 +1,16 @@
+##===- lib/Target/ARM64/MCTargetDesc/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64Desc
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common

Added: llvm/trunk/lib/Target/ARM64/Makefile
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/Makefile?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/Makefile (added)
+++ llvm/trunk/lib/Target/ARM64/Makefile Sat Mar 29 05:18:08 2014
@@ -0,0 +1,25 @@
+##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMARM64CodeGen
+TARGET = ARM64
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \
+		ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \
+		ARM64GenDAGISel.inc \
+		ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \
+		ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \
+		ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \
+		ARM64GenMCPseudoLowering.inc
+
+DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc
+
+include $(LEVEL)/Makefile.common

Added: llvm/trunk/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp (added)
+++ llvm/trunk/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+Target TheARM64Target;
+} // end namespace llvm
+
+extern "C" void LLVMInitializeARM64TargetInfo() {
+  RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64Target, "arm64",
+                                                   "ARM64");
+}

Added: llvm/trunk/lib/Target/ARM64/TargetInfo/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/TargetInfo/CMakeLists.txt?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/TargetInfo/CMakeLists.txt (added)
+++ llvm/trunk/lib/Target/ARM64/TargetInfo/CMakeLists.txt Sat Mar 29 05:18:08 2014
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARM64Info
+  ARM64TargetInfo.cpp
+  )
+
+add_dependencies(LLVMARM64Info ARM64CommonTableGen)

Added: llvm/trunk/lib/Target/ARM64/TargetInfo/LLVMBuild.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/TargetInfo/LLVMBuild.txt?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/TargetInfo/LLVMBuild.txt (added)
+++ llvm/trunk/lib/Target/ARM64/TargetInfo/LLVMBuild.txt Sat Mar 29 05:18:08 2014
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64Info
+parent = ARM64
+required_libraries = MC Support
+add_to_library_groups = ARM64
+

Added: llvm/trunk/lib/Target/ARM64/TargetInfo/Makefile
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/TargetInfo/Makefile?rev=205090&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM64/TargetInfo/Makefile (added)
+++ llvm/trunk/lib/Target/ARM64/TargetInfo/Makefile Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common

Modified: llvm/trunk/lib/Target/LLVMBuild.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/LLVMBuild.txt?rev=205090&r1=205089&r2=205090&view=diff
==============================================================================
--- llvm/trunk/lib/Target/LLVMBuild.txt (original)
+++ llvm/trunk/lib/Target/LLVMBuild.txt Sat Mar 29 05:18:08 2014
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = AArch64 ARM CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = AArch64 ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
 
 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with the best execution engine (the native JIT, if available, or the

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=205090&r1=205089&r2=205090&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Sat Mar 29 05:18:08 2014
@@ -654,7 +654,9 @@ Instruction *InstCombiner::visitCallInst
   }
 
   case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu: {
+  case Intrinsic::arm_neon_vmullu:
+  case Intrinsic::arm64_neon_smull:
+  case Intrinsic::arm64_neon_umull: {
     Value *Arg0 = II->getArgOperand(0);
     Value *Arg1 = II->getArgOperand(1);
 
@@ -664,7 +666,8 @@ Instruction *InstCombiner::visitCallInst
     }
 
     // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu);
+    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu ||
+                 II->getIntrinsicID() == Intrinsic::arm64_neon_umull);
     VectorType *NewVT = cast<VectorType>(II->getType());
     if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
       if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {

Added: llvm/trunk/test/Analysis/CostModel/ARM64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/ARM64/lit.local.cfg?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/ARM64/lit.local.cfg (added)
+++ llvm/trunk/test/Analysis/CostModel/ARM64/lit.local.cfg Sat Mar 29 05:18:08 2014
@@ -0,0 +1,3 @@
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+    config.unsupported = True

Added: llvm/trunk/test/Analysis/CostModel/ARM64/select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/ARM64/select.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/ARM64/select.ll (added)
+++ llvm/trunk/test/Analysis/CostModel/ARM64/select.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,38 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; CHECK-LABEL: select
+define void @select() {
+    ; Scalar values
+  ; CHECK: cost of 1 {{.*}} select
+  %v1 = select i1 undef, i8 undef, i8 undef
+  ; CHECK: cost of 1 {{.*}} select
+  %v2 = select i1 undef, i16 undef, i16 undef
+  ; CHECK: cost of 1 {{.*}} select
+  %v3 = select i1 undef, i32 undef, i32 undef
+  ; CHECK: cost of 1 {{.*}} select
+  %v4 = select i1 undef, i64 undef, i64 undef
+  ; CHECK: cost of 1 {{.*}} select
+  %v5 = select i1 undef, float undef, float undef
+  ; CHECK: cost of 1 {{.*}} select
+  %v6 = select i1 undef, double undef, double undef
+
+  ; Vector values - check for vectors that have a high cost because they end up
+  ; scalarized.
+  ; CHECK: cost of 320 {{.*}} select
+  %v13b = select <16 x i1>  undef, <16 x i16> undef, <16 x i16> undef
+
+  ; CHECK: cost of 160 {{.*}} select
+  %v15b = select <8 x i1>  undef, <8 x i32> undef, <8 x i32> undef
+  ; CHECK: cost of 320 {{.*}} select
+  %v15c = select <16 x i1>  undef, <16 x i32> undef, <16 x i32> undef
+
+  ; CHECK: cost of 80 {{.*}} select
+  %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+  ; CHECK: cost of 160 {{.*}} select
+  %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+  ; CHECK: cost of 320 {{.*}} select
+  %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+
+    ret void
+}

Added: llvm/trunk/test/Analysis/CostModel/ARM64/store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/ARM64/store.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/ARM64/store.ll (added)
+++ llvm/trunk/test/Analysis/CostModel/ARM64/store.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,22 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+; CHECK-LABEL: store
+define void @store() {
+    ; Stores of <2 x i64> should be expensive because we don't split them and
+    ; unaligned 16b stores have bad performance.
+    ; CHECK: cost of 12 {{.*}} store
+    store <2 x i64> undef, <2 x i64> * undef
+
+    ; We scalarize the loads/stores because there is no vector register name for
+    ; these types (they get extended to v.4h/v.2s).
+    ; CHECK: cost of 16 {{.*}} store
+    store <2 x i8> undef, <2 x i8> * undef
+    ; CHECK: cost of 64 {{.*}} store
+    store <4 x i8> undef, <4 x i8> * undef
+    ; CHECK: cost of 16 {{.*}} load
+    load <2 x i8> * undef
+    ; CHECK: cost of 64 {{.*}} load
+    load <4 x i8> * undef
+
+    ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; Can't copy or spill / restore CPSR.
+; rdar://9105206
+
+define fastcc void @t() ssp align 2 {
+entry:
+  br i1 undef, label %bb3.i, label %bb2.i
+
+bb2.i:                                            ; preds = %entry
+  br label %bb3.i
+
+bb3.i:                                            ; preds = %bb2.i, %entry
+  br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69
+
+bb.i69:                                           ; preds = %bb3.i
+  br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+
+_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i
+  %0 = select i1 undef, float 0.000000e+00, float undef
+  %1 = fdiv float %0, undef
+  %2 = fcmp ult float %1, 0xBF847AE140000000
+  %storemerge9 = select i1 %2, float %1, float 0.000000e+00
+  store float %storemerge9, float* undef, align 4
+  br i1 undef, label %bb42, label %bb47
+
+bb42:                                             ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+  br i1 undef, label %bb46, label %bb53
+
+bb46:                                             ; preds = %bb42
+  br label %bb48
+
+bb47:                                             ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+  br label %bb48
+
+bb48:                                             ; preds = %bb47, %bb46
+  br i1 undef, label %bb1.i14, label %bb.i13
+
+bb.i13:                                           ; preds = %bb48
+  br label %bb1.i14
+
+bb1.i14:                                          ; preds = %bb.i13, %bb48
+  br label %bb53
+
+bb53:                                             ; preds = %bb1.i14, %bb42
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; rdar://9146594
+
+define void @drt_vsprintf() nounwind ssp {
+entry:
+  %do_tab_convert = alloca i32, align 4
+  br i1 undef, label %if.then24, label %if.else295, !dbg !13
+
+if.then24:                                        ; preds = %entry
+  unreachable
+
+if.else295:                                       ; preds = %entry
+  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+  store i32 0, i32* %do_tab_convert, align 4, !dbg !19
+  unreachable
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.gv = !{!0}
+!llvm.dbg.sp = !{!1, !7, !10, !11, !12}
+
+!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!9 = metadata !{null}
+!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!13 = metadata !{i32 653, i32 5, metadata !14, null}
+!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{i32 853, i32 11, metadata !17, null}
+!19 = metadata !{i32 853, i32 29, metadata !17, null}
+!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
+!21 = metadata !{i32 0}

Added: llvm/trunk/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo(i64 %val) {
+; CHECK: foo
+;   The stack frame store is not 64-bit aligned. Make sure we use an
+;   instruction that can handle that.
+; CHECK: stur x0, [sp, #20]
+  %a = alloca [49 x i32], align 4
+  %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2
+  %p = bitcast i32* %p32 to i64*
+  store i64 %val, i64* %p, align 8
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-iOS5.0
+
+; CPSR is not allocatable, so the fast register allocator wouldn't mark its
+; uses killed.
+; rdar://9313272
+
+define hidden void @t() nounwind {
+entry:
+  %cmp = icmp eq i32* null, undef
+  %frombool = zext i1 %cmp to i8
+  store i8 %frombool, i8* undef, align 1
+  %tmp4 = load i8* undef, align 1
+  %tobool = trunc i8 %tmp4 to i1
+  br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %land.lhs.true14, label %if.end33
+
+land.lhs.true14:                                  ; preds = %if.end
+  unreachable
+
+if.end33:                                         ; preds = %if.end
+  unreachable
+}

Added: llvm/trunk/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+; Can't fold the increment by 1<<12 into a post-increment load
+; rdar://10301335
+
+ at test_data = common global i32 0, align 4
+
+define void @t() nounwind ssp {
+; CHECK-LABEL: t:
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: for.body
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: add x[[REG:[0-9]+]],
+; CHECK:                      x[[REG]], #4096
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 12
+  %add = add nsw i64 %0, 34628173824
+  %1 = inttoptr i64 %add to i32*
+  %2 = load volatile i32* %1, align 4096
+  store volatile i32 %2, i32* @test_data, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=arm64
+
+; The target lowering for integer comparisons was replacing some DAG nodes
+; during operation legalization, which resulted in dangling pointers,
+; cycles in DAGs, and eventually crashes.  This is the testcase for
+; one of those crashes. (rdar://10653656)
+
+define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 {
+entry:
+  br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false:
+  br i1 undef, label %return, label %if.end
+
+if.end:
+  %tmp.i = load i64* undef, align 8
+  %and.i.i.i = and i64 %tmp.i, -16
+  br i1 %IsArrow, label %if.else_crit_edge, label %if.end32
+
+if.else_crit_edge:
+  br i1 undef, label %if.end32, label %return
+
+if.end32:
+  %0 = icmp ult i32 undef, 3
+  %1 = zext i64 %tmp.i to i320
+  %.pn.v = select i1 %0, i320 128, i320 64
+  %.pn = shl i320 %1, %.pn.v
+  %ins346392 = or i320 %.pn, 0
+  store i320 %ins346392, i320* undef, align 8
+  br i1 undef, label %sw.bb.i.i, label %exit
+
+sw.bb.i.i:
+  unreachable
+
+exit:
+  unreachable
+
+return:
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i32 @foo(<4 x i32> %a, i32 %n) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: fmov w0, s0
+; CHECK-NEXT: ret
+  %b = bitcast <4 x i32> %a to i128
+  %c = trunc i128 %b to i32
+  ret i32 %c
+}
+
+define i64 @bar(<2 x i64> %a, i64 %n) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: fmov x0, d0
+; CHECK-NEXT: ret
+  %b = bitcast <2 x i64> %a to i128
+  %c = trunc i128 %b to i64
+  ret i64 %c
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s
+; <rdar://problem/11294426>
+
+ at b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4
+
+; The important thing for this test is that we need an unaligned load of `l_b'
+; ("ldr w2, [x1, #8]" in this case).
+
+; CHECK:      adrp x[[PAGE:[0-9]+]], {{l_b at PAGE|.Lb}}
+; CHECK: add  x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b at PAGEOFF|:lo12:.Lb}}
+; CHECK-NEXT: ldr  [[VAL:w[0-9]+]], [x[[ADDR]], #8]
+; CHECK-NEXT: str  [[VAL]], [x0, #8]
+; CHECK-NEXT: ldr  [[VAL2:x[0-9]+]], [x[[ADDR]]]
+; CHECK-NEXT: str  [[VAL2]], [x0]
+
+define void @foo(i8* %a) {
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://problem/11392109>
+
+define hidden void @t() optsize ssp {
+entry:
+  store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8
+; CHECK:             adrp    x{{[0-9]+}}, _x at GOTPAGE
+; CHECK:        ldr     x{{[0-9]+}}, [x{{[0-9]+}}, _x at GOTPAGEOFF]
+; CHECK-NEXT:        and     x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff
+; CHECK-NEXT:        str     x{{[0-9]+}}, [x{{[0-9]+}}]
+  unreachable
+}
+
+declare i64 @x(i32) optsize
+
+; Worth checking the Linux code is sensible too: only way to access
+; the GOT is via a 64-bit load. Just loading wN is unacceptable
+; (there's no ELF relocation to do that).
+
+; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x
+; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x]

Added: llvm/trunk/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s
+
+; LdStOpt bug created illegal instruction:
+;   %D1<def>, %D2<def> = LDPSi %X0, 1
+; rdar://11512047
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}}
+  %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+  %0 = bitcast %0* %self to i8*
+  %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+  %add.ptr10.0 = bitcast i8* %add.ptr to double*
+  %tmp11 = load double* %add.ptr10.0, align 8
+  %add.ptr.sum = add i64 %ivar, 8
+  %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+  %1 = bitcast i8* %add.ptr10.1 to double*
+  %tmp12 = load double* %1, align 8
+  %add.ptr.sum17 = add i64 %ivar, 16
+  %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+  %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+  %tmp = load double* %add.ptr4.1.0, align 8
+  %add.ptr4.1.sum = add i64 %ivar, 24
+  %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+  %2 = bitcast i8* %add.ptr4.1.1 to double*
+  %tmp5 = load double* %2, align 8
+  %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+  %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+  %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+  %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+  %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+  %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+  ret %struct.CGRect %insert3
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}

Added: llvm/trunk/test/CodeGen/ARM64/2012-06-06-FPToUI.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-06-06-FPToUI.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-06-06-FPToUI.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-06-06-FPToUI.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,65 @@
+; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+
+@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
+@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1
+@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1
+@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1
+
+define void @testDouble(double %d) ssp {
+; CHECK:  fcvtzu x{{.}}, d{{.}}
+; CHECK:  fcvtzu w{{.}}, d{{.}}
+entry:
+  %d.addr = alloca double, align 8
+  store double %d, double* %d.addr, align 8
+  %0 = load double* %d.addr, align 8
+  %1 = load double* %d.addr, align 8
+  %conv = fptoui double %1 to i64
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+  %2 = load double* %d.addr, align 8
+  %3 = load double* %d.addr, align 8
+  %conv1 = fptoui double %3 to i32
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+  ret void
+}
+
+declare i32 @printf(i8*, ...)
+
+define void @testFloat(float %f) ssp {
+; CHECK:  fcvtzu x{{.}}, s{{.}}
+; CHECK:  fcvtzu w{{.}}, s{{.}}
+entry:
+  %f.addr = alloca float, align 4
+  store float %f, float* %f.addr, align 4
+  %0 = load float* %f.addr, align 4
+  %conv = fpext float %0 to double
+  %1 = load float* %f.addr, align 4
+  %conv1 = fptoui float %1 to i64
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+  %2 = load float* %f.addr, align 4
+  %conv2 = fpext float %2 to double
+  %3 = load float* %f.addr, align 4
+  %conv3 = fptoui float %3 to i32
+  %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+  ret void
+}
+
+define i32 @main(i32 %argc, i8** %argv) ssp {
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca i8**, align 8
+  store i32 0, i32* %retval
+  store i32 %argc, i32* %argc.addr, align 4
+  store i8** %argv, i8*** %argv.addr, align 8
+  call void @testDouble(double 1.159198e+01)
+  call void @testFloat(float 0x40272F1800000000)
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}

Added: llvm/trunk/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios
+; rdar://11849816
+
+@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8
+
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+
+declare noalias i8* @xmalloc(i64) optsize
+
+declare i64 @strlen(i8* nocapture) nounwind readonly optsize
+
+declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize
+
+declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize
+
+declare noalias i8* @xstrdup(i8*) optsize
+
+define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp {
+entry:
+  br i1 undef, label %if.end56, label %for.cond
+
+for.cond:                                         ; preds = %entry
+  br i1 undef, label %for.cond10, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  unreachable
+
+for.cond10:                                       ; preds = %for.cond
+  br i1 undef, label %if.end56, label %for.body14
+
+for.body14:                                       ; preds = %for.cond10
+  %call22 = tail call i64 @strlen(i8* undef) nounwind optsize
+  %sext = shl i64 %call22, 32
+  %conv30 = ashr exact i64 %sext, 32
+  %add29 = sub i64 0, %conv30
+  %sub = add i64 %add29, 0
+  %add31 = shl i64 %sub, 32
+  %sext59 = add i64 %add31, 4294967296
+  %conv33 = ashr exact i64 %sext59, 32
+  %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize
+  br i1 undef, label %cond.false45, label %cond.true43
+
+cond.true43:                                      ; preds = %for.body14
+  unreachable
+
+cond.false45:                                     ; preds = %for.body14
+  %add.ptr = getelementptr inbounds i8* %path, i64 %conv30
+  unreachable
+
+if.end56:                                         ; preds = %for.cond10, %entry
+  ret i8* null
+}
+
+declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize
+
+declare i8* @strcpy(i8*, i8* nocapture) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+;FAST-LABEL: _Z9example25v:
+;FAST: fcmgt.4s
+;FAST: ret
+
+;CHECK-LABEL: _Z9example25v:
+;CHECK: fcmgt.4s
+;CHECK: ret
+
+define <4 x i32> @_Z9example25v( <4 x float> %N0,  <4 x float> %N1) {
+  %A = fcmp olt <4 x float> %N0, %N1
+  %B = zext <4 x i1> %A to <4 x i32>
+  ret <4 x i32> %B
+}

Added: llvm/trunk/test/CodeGen/ARM64/2013-01-23-frem-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2013-01-23-frem-crash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2013-01-23-frem-crash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2013-01-23-frem-crash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD13158() {
+entry:
+  %B26 = frem float 0.000000e+00, undef
+  br i1 undef, label %CF, label %CF77
+
+CF:                                               ; preds = %CF, %CF76
+  store float %B26, float* undef
+  br i1 undef, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2013-01-23-sext-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2013-01-23-sext-crash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2013-01-23-sext-crash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2013-01-23-sext-crash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=arm64
+
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD12881() {
+BB:
+  %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+  br label %CF
+
+CF:                                               ; preds = %CF83, %CF, %BB
+  br i1 undef, label %CF, label %CF83
+
+CF83:                                             ; preds = %CF
+  %FC70 = sitofp <4 x i32> %B17 to <4 x double>
+  br label %CF
+}
+
+
+define void @autogen_SD12881_2() {
+BB:
+  %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+  br label %CF
+
+CF:                                               ; preds = %CF83, %CF, %BB
+  br i1 undef, label %CF, label %CF83
+
+CF83:                                             ; preds = %CF
+  %FC70 = uitofp <4 x i32> %B17 to <4 x double>
+  br label %CF
+}
+
+define void @_Z12my_example2bv() nounwind noinline ssp {
+entry:
+  %0 = fptosi <2 x double> undef to <2 x i32>
+  store <2 x i32> %0, <2 x i32>* undef, align 8
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple
+
+;CHECK-LABEL: Shuff:
+;CHECK: tbl.8b
+;CHECK: ret
+define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp {
+  %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %value
+}
+
+

Added: llvm/trunk/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/AdvSIMD-Scalar.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/AdvSIMD-Scalar.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/AdvSIMD-Scalar.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s
+;
+define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: bar:
+; CHECK: add.2d	v[[REG:[0-9]+]], v0, v1
+; CHECK: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub	d[[REG2:[0-9]+]], d[[REG]], d1
+  %add = add <2 x i64> %a, %b
+  %vgetq_lane = extractelement <2 x i64> %add, i32 0
+  %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
+  %add3 = add i64 %vgetq_lane, %vgetq_lane2
+  %sub = sub i64 %vgetq_lane, %vgetq_lane2
+  %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0
+  %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1
+  ret <2 x i64> %vecinit8
+}
+
+define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: subdd_su64:
+; CHECK: sub d0, d1, d0
+; CHECK-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %sub.i = sub nsw i64 %vecext1, %vecext
+  %retval = bitcast i64 %sub.i to double
+  ret double %retval
+}
+
+define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vaddd_su64:
+; CHECK: add d0, d1, d0
+; CHECK-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %add.i = add nsw i64 %vecext1, %vecext
+  %retval = bitcast i64 %add.i to double
+  ret double %retval
+}

Added: llvm/trunk/test/CodeGen/ARM64/aapcs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/aapcs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/aapcs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/aapcs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,86 @@
+; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s
+
+@var = global i32 0, align 4
+
+define i128 @test_i128_align(i32, i128 %arg, i32 %after) {
+  store i32 %after, i32* @var, align 4
+; CHECK: str w4, [{{x[0-9]+}}, :lo12:var]
+
+  ret i128 %arg
+; CHECK: mov x0, x2
+; CHECK: mov x1, x3
+}
+
+@var64 = global i64 0, align 8
+
+  ; Check stack slots are 64-bit at all times.
+define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
+                                i32 %int, i64 %long) {
+  ; Part of last store. Blasted scheduler.
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
+
+  %ext_bool = zext i1 %bool to i64
+  store volatile i64 %ext_bool, i64* @var64, align 8
+; CHECK: ldr w[[EXT:[0-9]+]], [sp]
+; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
+; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_char = zext i8 %char to i64
+  store volatile i64 %ext_char, i64* @var64, align 8
+; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_short = zext i16 %short to i64
+  store volatile i64 %ext_short, i64* @var64, align 8
+; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_int = zext i32 %int to i64
+  store volatile i64 %ext_int, i64* @var64, align 8
+; CHECK: ldr w[[EXT:[0-9]+]], [sp, #24]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  store volatile i64 %long, i64* @var64, align 8
+; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64]
+
+  ret void
+}
+
+; Make sure the callee does extensions (in the absence of zext/sext
+; keyword on args) while we're here.
+
+define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
+  %ext_bool = zext i1 %bool to i64
+  store volatile i64 %ext_bool, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x0, #0x1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_char = sext i8 %char to i64
+  store volatile i64 %ext_char, i64* @var64
+; CHECK: sxtb [[EXT:x[0-9]+]], x1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_short = zext i16 %short to i64
+  store volatile i64 %ext_short, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  %ext_int = zext i32 %int to i64
+  store volatile i64 %ext_int, i64* @var64
+; CHECK: uxtw [[EXT:x[0-9]+]], x3
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+  ret void
+}
+
+declare void @variadic(i32 %a, ...)
+
+  ; Under AAPCS variadic functions have the same calling convention as
+  ; others. The extra arguments should go in registers rather than on the stack.
+define void @test_variadic() {
+  call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+; CHECK: fmov d0, #2.0
+; CHECK: orr x1, xzr, #0x1
+; CHECK: bl variadic
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/abi-varargs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/abi-varargs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/abi-varargs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/abi-varargs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,191 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+; rdar://13625505
+; Here we have 9 fixed integer arguments; the 9th argument is on the stack, the
+; varargs start right after at 8-byte alignment.
+define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+; CHECK-LABEL: fn9:
+; 9th fixed argument
+; CHECK: ldr {{w[0-9]+}}, [sp, #64]
+; CHECK: add [[ARGS:x[0-9]+]], sp, #72
+; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
+; First vararg
+; CHECK: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Second vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Third vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  %5 = alloca i32, align 4
+  %6 = alloca i32, align 4
+  %7 = alloca i32, align 4
+  %8 = alloca i32, align 4
+  %9 = alloca i32, align 4
+  %args = alloca i8*, align 8
+  %a10 = alloca i32, align 4
+  %a11 = alloca i32, align 4
+  %a12 = alloca i32, align 4
+  store i32 %a1, i32* %1, align 4
+  store i32 %a2, i32* %2, align 4
+  store i32 %a3, i32* %3, align 4
+  store i32 %a4, i32* %4, align 4
+  store i32 %a5, i32* %5, align 4
+  store i32 %a6, i32* %6, align 4
+  store i32 %a7, i32* %7, align 4
+  store i32 %a8, i32* %8, align 4
+  store i32 %a9, i32* %9, align 4
+  %10 = bitcast i8** %args to i8*
+  call void @llvm.va_start(i8* %10)
+  %11 = va_arg i8** %args, i32
+  store i32 %11, i32* %a10, align 4
+  %12 = va_arg i8** %args, i32
+  store i32 %12, i32* %a11, align 4
+  %13 = va_arg i8** %args, i32
+  store i32 %13, i32* %a12, align 4
+  ret void
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+define i32 @main() nounwind ssp {
+; CHECK-LABEL: main:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK: str {{w[0-9]+}}, [sp]
+  %a1 = alloca i32, align 4
+  %a2 = alloca i32, align 4
+  %a3 = alloca i32, align 4
+  %a4 = alloca i32, align 4
+  %a5 = alloca i32, align 4
+  %a6 = alloca i32, align 4
+  %a7 = alloca i32, align 4
+  %a8 = alloca i32, align 4
+  %a9 = alloca i32, align 4
+  %a10 = alloca i32, align 4
+  %a11 = alloca i32, align 4
+  %a12 = alloca i32, align 4
+  store i32 1, i32* %a1, align 4
+  store i32 2, i32* %a2, align 4
+  store i32 3, i32* %a3, align 4
+  store i32 4, i32* %a4, align 4
+  store i32 5, i32* %a5, align 4
+  store i32 6, i32* %a6, align 4
+  store i32 7, i32* %a7, align 4
+  store i32 8, i32* %a8, align 4
+  store i32 9, i32* %a9, align 4
+  store i32 10, i32* %a10, align 4
+  store i32 11, i32* %a11, align 4
+  store i32 12, i32* %a12, align 4
+  %1 = load i32* %a1, align 4
+  %2 = load i32* %a2, align 4
+  %3 = load i32* %a3, align 4
+  %4 = load i32* %a4, align 4
+  %5 = load i32* %a5, align 4
+  %6 = load i32* %a6, align 4
+  %7 = load i32* %a7, align 4
+  %8 = load i32* %a8, align 4
+  %9 = load i32* %a9, align 4
+  %10 = load i32* %a10, align 4
+  %11 = load i32* %a11, align 4
+  %12 = load i32* %a12, align 4
+  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+  ret i32 0
+}
+
+;rdar://13668483
+@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1
+define void @foo(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+  %fmt.addr = alloca i8*, align 8
+  %args = alloca i8*, align 8
+  %vc = alloca i32, align 4
+  %vv = alloca <4 x i32>, align 16
+  store i8* %fmt, i8** %fmt.addr, align 8
+  %args1 = bitcast i8** %args to i8*
+  call void @llvm.va_start(i8* %args1)
+  %0 = va_arg i8** %args, i32
+  store i32 %0, i32* %vc, align 4
+  %1 = va_arg i8** %args, <4 x i32>
+  store <4 x i32> %1, <4 x i32>* %vv, align 16
+  ret void
+}
+
+define void @bar(i32 %x, <4 x i32> %y) nounwind {
+entry:
+; CHECK-LABEL: bar:
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca <4 x i32>, align 16
+  store i32 %x, i32* %x.addr, align 4
+  store <4 x i32> %y, <4 x i32>* %y.addr, align 16
+  %0 = load i32* %x.addr, align 4
+  %1 = load <4 x i32>* %y.addr, align 16
+  call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+  ret void
+}
+
+; rdar://13668927
+; When passing 16-byte aligned small structs as vararg, make sure the caller
+; side is 16-byte aligned on stack.
+%struct.s41 = type { i32, i16, i32, i16 }
+define void @foo2(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo2:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+  %fmt.addr = alloca i8*, align 8
+  %args = alloca i8*, align 8
+  %vc = alloca i32, align 4
+  %vs = alloca %struct.s41, align 16
+  store i8* %fmt, i8** %fmt.addr, align 8
+  %args1 = bitcast i8** %args to i8*
+  call void @llvm.va_start(i8* %args1)
+  %0 = va_arg i8** %args, i32
+  store i32 %0, i32* %vc, align 4
+  %ap.cur = load i8** %args
+  %1 = getelementptr i8* %ap.cur, i32 15
+  %2 = ptrtoint i8* %1 to i64
+  %3 = and i64 %2, -16
+  %ap.align = inttoptr i64 %3 to i8*
+  %ap.next = getelementptr i8* %ap.align, i32 16
+  store i8* %ap.next, i8** %args
+  %4 = bitcast i8* %ap.align to %struct.s41*
+  %5 = bitcast %struct.s41* %vs to i8*
+  %6 = bitcast %struct.s41* %4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false)
+  ret void
+}
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @bar2(i32 %x, i128 %s41.coerce) nounwind {
+entry:
+; CHECK-LABEL: bar2:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+  %x.addr = alloca i32, align 4
+  %s41 = alloca %struct.s41, align 16
+  store i32 %x, i32* %x.addr, align 4
+  %0 = bitcast %struct.s41* %s41 to i128*
+  store i128 %s41.coerce, i128* %0, align 1
+  %1 = load i32* %x.addr, align 4
+  %2 = bitcast %struct.s41* %s41 to i128*
+  %3 = load i128* %2, align 1
+  call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/abi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/abi.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/abi.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/abi.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,236 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://9932559
+define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
+entry:
+; CHECK-LABEL: i8i16callee:
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: ldrsb	{{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh	{{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb	{{w[0-9]+}}, [sp]
+; CHECK: ldrsb	{{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb  {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb  {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh  {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb  {{w[0-9]+}}, [sp]
+  %conv = sext i8 %a4 to i64
+  %conv3 = sext i16 %a5 to i64
+  %conv8 = sext i8 %b1 to i64
+  %conv9 = sext i16 %b2 to i64
+  %conv11 = sext i8 %b3 to i64
+  %conv13 = sext i8 %b4 to i64
+  %add10 = add i64 %a2, %a1
+  %add12 = add i64 %add10, %a3
+  %add14 = add i64 %add12, %conv
+  %add = add i64 %add14, %conv3
+  %add1 = add i64 %add, %a6
+  %add2 = add i64 %add1, %a7
+  %add4 = add i64 %add2, %a8
+  %add5 = add i64 %add4, %conv8
+  %add6 = add i64 %add5, %conv9
+  %add7 = add i64 %add6, %conv11
+  %add15 = add i64 %add7, %conv13
+  %sext = shl i64 %add15, 32
+  %conv17 = ashr exact i64 %sext, 32
+  ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+  %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+  %conv = trunc i64 %call to i32
+  ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+  %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+  ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; 16-byte vector should be aligned at 16-byte when passing on stack.
+; A double argument will be passed on stack, so vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+  %0 = load <4 x i32>* %in, align 16
+  %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+  ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8
+define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
+       double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
+entry:
+; CHECK: test1
+; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+  %conv = sitofp i32 %i to float
+  %add = fadd float %conv, %f1
+  %conv1 = fpext float %add to double
+  %add2 = fadd double %conv1, %d7
+  %add3 = fadd double %add2, %d8
+  store double %add3, double* @g_d, align 8
+  ret void
+}
+
+; i9 at sp, d1 in register s0.
+define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+            i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
+entry:
+; CHECK: test2
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
+  %conv = sitofp i32 %i1 to float
+  %add = fadd float %conv, %d1
+  %conv1 = fpext float %add to double
+  %conv2 = sitofp i32 %i8 to double
+  %add3 = fadd double %conv2, %conv1
+  %conv4 = sitofp i32 %i9 to double
+  %add5 = fadd double %conv4, %add3
+  store double %add5, double* @g_d, align 8
+  ret void
+}
+
+; rdar://12648441
+; Check alignment on stack for v64, f64, i64, f32, i32.
+define double @test3(<2 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: test3
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; FAST: test3
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
+  %0 = load <2 x i32>* %in, align 8
+  %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0,
+          <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0,
+          <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3)
+  ret double %call
+}
+declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>,
+               <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext)
+
+define double @test4(double* nocapture %in) nounwind {
+entry:
+; CHECK: test4
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+; CHECK: orr w0, wzr, #0x3
+  %0 = load double* %in, align 8
+  %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0,
+          double %0, double %0, double %0, double %0, double %0,
+          float 3.000000e+00, double %0, i8 signext 3)
+  ret double %call
+}
+declare double @args_f64(double, double, double, double, double, double, double,
+               double, float, double, i8 signext)
+
+define i64 @test5(i64* nocapture %in) nounwind {
+entry:
+; CHECK: test5
+; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
+; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+  %0 = load i64* %in, align 8
+  %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0,
+                         i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3)
+  ret i64 %call
+}
+declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64,
+             i8 signext)
+
+define i32 @test6(float* nocapture %in) nounwind {
+entry:
+; CHECK: test6
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+  %0 = load float* %in, align 4
+  %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+          i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0,
+          float 6.0, float 7.0, float 8.0, i16 signext 3, float %0,
+          i8 signext 3)
+  ret i32 %call
+}
+declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32,
+                      float, float, float, float, float, float, float, float,
+                      i16 signext, float, i8 signext)
+
+define i32 @test7(i32* nocapture %in) nounwind {
+entry:
+; CHECK: test7
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+  %0 = load i32* %in, align 4
+  %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0,
+                         i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4)
+  ret i32 %call
+}
+declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
+             i8 signext)
+
+define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
+entry:
+; CHECK: test8
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
+; CHECK: bl
+; FAST: test8
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strb {{w[0-9]+}}, [sp, #1]
+; FAST: strb {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #3]
+; FAST: bl
+  tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false,
+                  i1 zeroext true, i1 zeroext false, i1 zeroext true,
+                  i1 zeroext false, i1 zeroext true, i1 zeroext false,
+                  i1 zeroext true, i1 zeroext false, i1 zeroext true)
+  ret i32 0
+}
+
+declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+                      i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+                      i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext)
+
+define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f,
+                               i64 %g, i64 %h, i64 %i, i1 zeroext %j) {
+; CHECK-LABEL: i1_stack_incoming:
+; CHECK: ldrb w0, [sp, #8]
+; CHECK: ret
+  %v = zext i1 %j to i32
+  ret i32 %v
+}

Added: llvm/trunk/test/CodeGen/ARM64/abi_align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/abi_align.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/abi_align.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/abi_align.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,529 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://12648441
+; Generated from arm64-arguments.c with -O2.
+; Test passing structs with size < 8, < 16 and > 16
+; with alignment of 16 and without
+
+; Structs with size < 8
+%struct.s38 = type { i32, i16 }
+; With alignment of 16, the size will be padded to multiple of 16 bytes.
+%struct.s39 = type { i32, i16, [10 x i8] }
+; Structs with size < 16
+%struct.s40 = type { i32, i16, i32, i16 }
+%struct.s41 = type { i32, i16, i32, i16 }
+; Structs with size > 16
+%struct.s42 = type { i32, i16, i32, i16, i32, i16 }
+%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] }
+
+ at g38 = common global %struct.s38 zeroinitializer, align 4
+ at g38_2 = common global %struct.s38 zeroinitializer, align 4
+ at g39 = common global %struct.s39 zeroinitializer, align 16
+ at g39_2 = common global %struct.s39 zeroinitializer, align 16
+ at g40 = common global %struct.s40 zeroinitializer, align 4
+ at g40_2 = common global %struct.s40 zeroinitializer, align 4
+ at g41 = common global %struct.s41 zeroinitializer, align 16
+ at g41_2 = common global %struct.s41 zeroinitializer, align 16
+ at g42 = common global %struct.s42 zeroinitializer, align 4
+ at g42_2 = common global %struct.s42 zeroinitializer, align 4
+ at g43 = common global %struct.s43 zeroinitializer, align 16
+ at g43_2 = common global %struct.s43 zeroinitializer, align 16
+
+; structs with size < 8 bytes, passed via i64 in x1 and x2
+define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
+entry:
+; CHECK: f38
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w2
+  %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
+  %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32
+  %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32
+  %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32
+  %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16
+  %sext = trunc i64 %sext8 to i32
+  %conv = ashr exact i32 %sext, 16
+  %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16
+  %sext10 = trunc i64 %sext1011 to i32
+  %conv6 = ashr exact i32 %sext10, 16
+  %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+  %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+define i32 @caller38() #1 {
+entry:
+; CHECK: caller38
+; CHECK: ldr x1,
+; CHECK: ldr x2,
+  %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+  %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+  %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5
+  ret i32 %call
+}
+
+declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0
+
+; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16]
+; i9 at [sp]
+define i32 @caller38_stack() #1 {
+entry:
+; CHECK: caller38_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+  %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+  %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+  %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+                                   i32 7, i32 8, i32 9, i64 %0, i64 %1) #5
+  ret i32 %call
+}
+
+; structs with size < 8 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f39
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+  %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+  %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+  %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+  %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+  %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+  %sext = trunc i128 %sext8 to i32
+  %conv = ashr exact i32 %sext, 16
+  %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+  %sext10 = trunc i128 %sext1011 to i32
+  %conv6 = ashr exact i32 %sext10, 16
+  %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+  %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+define i32 @caller39() #1 {
+entry:
+; CHECK: caller39
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+  %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+  %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+  %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5
+  ret i32 %call
+}
+
+declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 8 bytes, alignment 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller39_stack() #1 {
+entry:
+; CHECK: caller39_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+  %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+  %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+  %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+                                   i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+  ret i32 %call
+}
+
+; structs with size < 16 bytes
+; passed via i128 in x1 and x3
+define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
+entry:
+; CHECK: f40
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+  %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
+  %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0
+  %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32
+  %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32
+  %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32
+  %sext8 = shl nuw nsw i64 %s1.sroa.0.4.extract.shift, 16
+  %sext = trunc i64 %sext8 to i32
+  %conv = ashr exact i32 %sext, 16
+  %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32
+  %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16
+  %sext10 = trunc i64 %sext1011 to i32
+  %conv6 = ashr exact i32 %sext10, 16
+  %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+  %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+define i32 @caller40() #1 {
+entry:
+; CHECK: caller40
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+  %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+  %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+  %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5
+  ret i32 %call
+}
+
+declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0
+
+; structs with size < 16 bytes
+; passed on stack at [sp+8] and [sp+24]
+define i32 @caller40_stack() #1 {
+entry:
+; CHECK: caller40_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+  %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+  %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+  %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+                         i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5
+  ret i32 %call
+}
+
+; structs with size < 16 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f41
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+  %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+  %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+  %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+  %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+  %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+  %sext = trunc i128 %sext8 to i32
+  %conv = ashr exact i32 %sext, 16
+  %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+  %sext10 = trunc i128 %sext1011 to i32
+  %conv6 = ashr exact i32 %sext10, 16
+  %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+  %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+define i32 @caller41() #1 {
+entry:
+; CHECK: caller41
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+  %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+  %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+  %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5
+  ret i32 %call
+}
+
+declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 16 bytes, alignment of 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller41_stack() #1 {
+entry:
+; CHECK: caller41_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+  %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+  %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+  %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+                            i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+  ret i32 %call
+}
+
+; structs with size of 22 bytes, passed indirectly in x1 and x2
+define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
+entry:
+; CHECK: f42
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f42
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+  %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0
+  %0 = load i32* %i1, align 4, !tbaa !0
+  %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0
+  %1 = load i32* %i2, align 4, !tbaa !0
+  %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1
+  %2 = load i16* %s, align 2, !tbaa !3
+  %conv = sext i16 %2 to i32
+  %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1
+  %3 = load i16* %s5, align 2, !tbaa !3
+  %conv6 = sext i16 %3 to i32
+  %add = add i32 %0, %i
+  %add3 = add i32 %add, %1
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+; For s1, we allocate a 22-byte space, pass its address via x1
+define i32 @caller42() #3 {
+entry:
+; CHECK: caller42
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller42
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-24 = sp+72
+; Space for s2 is allocated at sp+48
+; FAST: sub x[[A:[0-9]+]], fp, #24
+; FAST: add x[[A:[0-9]+]], sp, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+  %tmp = alloca %struct.s42, align 4
+  %tmp1 = alloca %struct.s42, align 4
+  %0 = bitcast %struct.s42* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+  %1 = bitcast %struct.s42* %tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+  %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+  ret i32 %call
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4
+
+declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                       i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1,
+                       %struct.s42* nocapture %s2) #2
+
+define i32 @caller42_stack() #3 {
+entry:
+; CHECK: caller42_stack
+; CHECK: mov fp, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{x[0-9]+}}, [fp, #-16]
+; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], fp, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller42_stack
+; Space for s1 is allocated at fp-24
+; Space for s2 is allocated at fp-48
+; FAST: sub x[[A:[0-9]+]], fp, #24
+; FAST: sub x[[B:[0-9]+]], fp, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+  %tmp = alloca %struct.s42, align 4
+  %tmp1 = alloca %struct.s42, align 4
+  %0 = bitcast %struct.s42* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+  %1 = bitcast %struct.s42* %tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+  %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+  ret i32 %call
+}
+
+; structs with size of 22 bytes, alignment of 16
+; passed indirectly in x1 and x2
+define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
+entry:
+; CHECK: f43
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f43
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+  %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0
+  %0 = load i32* %i1, align 4, !tbaa !0
+  %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0
+  %1 = load i32* %i2, align 4, !tbaa !0
+  %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1
+  %2 = load i16* %s, align 2, !tbaa !3
+  %conv = sext i16 %2 to i32
+  %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1
+  %3 = load i16* %s5, align 2, !tbaa !3
+  %conv6 = sext i16 %3 to i32
+  %add = add i32 %0, %i
+  %add3 = add i32 %add, %1
+  %add4 = add i32 %add3, %conv
+  %add7 = add i32 %add4, %conv6
+  ret i32 %add7
+}
+
+define i32 @caller43() #3 {
+entry:
+; CHECK: caller43
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller43
+; FAST: mov fp, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+; FAST: add x1, sp, #32
+; FAST: mov x2, sp
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{x[0-9]+}}, [sp]
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+; FAST: str {{x[0-9]+}}, [sp, #24]
+  %tmp = alloca %struct.s43, align 16
+  %tmp1 = alloca %struct.s43, align 16
+  %0 = bitcast %struct.s43* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+  %1 = bitcast %struct.s43* %tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+  %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+  ret i32 %call
+}
+
+declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+                       i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1,
+                       %struct.s43* nocapture %s2) #2
+
+define i32 @caller43_stack() #3 {
+entry:
+; CHECK: caller43_stack
+; CHECK: mov fp, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{q[0-9]+}}, [fp, #-16]
+; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], fp, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller43_stack
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; FAST: sub x[[A:[0-9]+]], fp, #32
+; FAST: add x[[B:[0-9]+]], sp, #32
+; FAST: stur {{x[0-9]+}}, [fp, #-32]
+; FAST: stur {{x[0-9]+}}, [fp, #-24]
+; FAST: stur {{x[0-9]+}}, [fp, #-16]
+; FAST: stur {{x[0-9]+}}, [fp, #-8]
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+  %tmp = alloca %struct.s43, align 16
+  %tmp1 = alloca %struct.s43, align 16
+  %0 = bitcast %struct.s43* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+  %1 = bitcast %struct.s43* %tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+  %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+  ret i32 %call
+}
+
+; rdar://13668927
+; Check that we don't split an i128.
+declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+                               i32 %i6, i32 %i7, i128 %s1, i32 %i8)
+
+define i32 @i128_split() {
+entry:
+; CHECK: i128_split
+; "i128 %0" should be on stack at [sp].
+; "i32 8" should be on stack at [sp, #16].
+; CHECK: str {{w[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
+; FAST: i128_split
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
+; FAST: stp {{x[0-9]+}}, {{x[0-9]+}}, [x[[ADDR]]]
+  %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+  %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5,
+                                           i32 6, i32 7, i128 %0, i32 8) #5
+  ret i32 %call
+}
+
+declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+                               i32 %i6, i32 %i7, i64 %s1, i32 %i8)
+
+define i32 @i64_split() {
+entry:
+; CHECK: i64_split
+; "i64 %0" should be in register x7.
+; "i32 8" should be on stack at [sp].
+; CHECK: ldr x7, [{{x[0-9]+}}]
+; CHECK: str {{w[0-9]+}}, [sp]
+; FAST: i64_split
+; FAST: ldr x7, [{{x[0-9]+}}]
+; FAST: str {{w[0-9]+}}, [sp]
+  %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
+  %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
+                                    i32 6, i32 7, i64 %0, i32 8) #5
+  ret i32 %call
+}
+
+attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #4 = { nounwind }
+attributes #5 = { nobuiltin }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"short", metadata !1}
+!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3}

Added: llvm/trunk/test/CodeGen/ARM64/addp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/addp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/addp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/addp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define double @foo(<2 x double> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+  %lane0.i = extractelement <2 x double> %a, i32 0
+  %lane1.i = extractelement <2 x double> %a, i32 1
+  %vpaddd.i = fadd double %lane0.i, %lane1.i
+  ret double %vpaddd.i
+}
+
+define i64 @foo0(<2 x i64> %a) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: addp.2d d0, v0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+  %lane0.i = extractelement <2 x i64> %a, i32 0
+  %lane1.i = extractelement <2 x i64> %a, i32 1
+  %vpaddd.i = add i64 %lane0.i, %lane1.i
+  ret i64 %vpaddd.i
+}
+
+define float @foo1(<2 x float> %a) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: faddp.2s
+; CHECK-NEXT: ret
+  %lane0.i = extractelement <2 x float> %a, i32 0
+  %lane1.i = extractelement <2 x float> %a, i32 1
+  %vpaddd.i = fadd float %lane0.i, %lane1.i
+  ret float %vpaddd.i
+}

Added: llvm/trunk/test/CodeGen/ARM64/addr-mode-folding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/addr-mode-folding.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/addr-mode-folding.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/addr-mode-folding.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,171 @@
+; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; <rdar://problem/13621857>
+
+ at block = common global i8* null, align 8
+
+define i32 @fct(i32 %i1, i32 %i2) {
+; CHECK: @fct
+; Sign extension is used more than once, thus it should not be folded.
+; CodeGenPrepare is not sharing sext across uses, thus this is folded because
+; of that.
+; _CHECK-NOT_: , sxtw]
+entry:
+  %idxprom = sext i32 %i1 to i64
+  %0 = load i8** @block, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+  %1 = load i8* %arrayidx, align 1
+  %idxprom1 = sext i32 %i2 to i64
+  %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+  %2 = load i8* %arrayidx2, align 1
+  %cmp = icmp eq i8 %1, %2
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp7 = icmp ugt i8 %1, %2
+  %conv8 = zext i1 %cmp7 to i32
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %inc = add nsw i32 %i1, 1
+  %inc9 = add nsw i32 %i2, 1
+  %idxprom10 = sext i32 %inc to i64
+  %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+  %3 = load i8* %arrayidx11, align 1
+  %idxprom12 = sext i32 %inc9 to i64
+  %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+  %4 = load i8* %arrayidx13, align 1
+  %cmp16 = icmp eq i8 %3, %4
+  br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18:                                        ; preds = %if.end
+  %cmp21 = icmp ugt i8 %3, %4
+  %conv22 = zext i1 %cmp21 to i32
+  br label %return
+
+if.end23:                                         ; preds = %if.end
+  %inc24 = add nsw i32 %i1, 2
+  %inc25 = add nsw i32 %i2, 2
+  %idxprom26 = sext i32 %inc24 to i64
+  %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+  %5 = load i8* %arrayidx27, align 1
+  %idxprom28 = sext i32 %inc25 to i64
+  %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+  %6 = load i8* %arrayidx29, align 1
+  %cmp32 = icmp eq i8 %5, %6
+  br i1 %cmp32, label %return, label %if.then34
+
+if.then34:                                        ; preds = %if.end23
+  %cmp37 = icmp ugt i8 %5, %6
+  %conv38 = zext i1 %cmp37 to i32
+  br label %return
+
+return:                                           ; preds = %if.end23, %if.then34, %if.then18, %if.then
+  %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+  ret i32 %retval.0
+}
+
+define i32 @fct1(i32 %i1, i32 %i2) optsize {
+; CHECK: @fct1
+; Addressing modes are folded when optimizing for code size.
+; CHECK: , sxtw]
+; CHECK: , sxtw]
+entry:
+  %idxprom = sext i32 %i1 to i64
+  %0 = load i8** @block, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+  %1 = load i8* %arrayidx, align 1
+  %idxprom1 = sext i32 %i2 to i64
+  %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+  %2 = load i8* %arrayidx2, align 1
+  %cmp = icmp eq i8 %1, %2
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp7 = icmp ugt i8 %1, %2
+  %conv8 = zext i1 %cmp7 to i32
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %inc = add nsw i32 %i1, 1
+  %inc9 = add nsw i32 %i2, 1
+  %idxprom10 = sext i32 %inc to i64
+  %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+  %3 = load i8* %arrayidx11, align 1
+  %idxprom12 = sext i32 %inc9 to i64
+  %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+  %4 = load i8* %arrayidx13, align 1
+  %cmp16 = icmp eq i8 %3, %4
+  br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18:                                        ; preds = %if.end
+  %cmp21 = icmp ugt i8 %3, %4
+  %conv22 = zext i1 %cmp21 to i32
+  br label %return
+
+if.end23:                                         ; preds = %if.end
+  %inc24 = add nsw i32 %i1, 2
+  %inc25 = add nsw i32 %i2, 2
+  %idxprom26 = sext i32 %inc24 to i64
+  %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+  %5 = load i8* %arrayidx27, align 1
+  %idxprom28 = sext i32 %inc25 to i64
+  %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+  %6 = load i8* %arrayidx29, align 1
+  %cmp32 = icmp eq i8 %5, %6
+  br i1 %cmp32, label %return, label %if.then34
+
+if.then34:                                        ; preds = %if.end23
+  %cmp37 = icmp ugt i8 %5, %6
+  %conv38 = zext i1 %cmp37 to i32
+  br label %return
+
+return:                                           ; preds = %if.end23, %if.then34, %if.then18, %if.then
+  %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+  ret i32 %retval.0
+}
+
+; CHECK: @test
+; CHECK-NOT: , uxtw #2]
+define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) {
+entry:
+  %conv = zext i8 %c to i32
+  %add = sub i32 0, %arg
+  %tobool = icmp eq i32 %conv, %add
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = zext i8 %c to i64
+  %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+  %0 = load volatile i32* %arrayidx, align 4
+  %1 = load volatile i32* %arrayidx, align 4
+  %add3 = add nsw i32 %1, %0
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+  ret i32 %res.0
+}
+
+
+; CHECK: @test2
+; CHECK: , uxtw #2]
+; CHECK: , uxtw #2]
+define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize {
+entry:
+  %conv = zext i8 %c to i32
+  %add = sub i32 0, %arg
+  %tobool = icmp eq i32 %conv, %add
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %idxprom = zext i8 %c to i64
+  %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+  %0 = load volatile i32* %arrayidx, align 4
+  %1 = load volatile i32* %arrayidx, align 4
+  %add3 = add nsw i32 %1, %0
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+  ret i32 %res.0
+}

Added: llvm/trunk/test/CodeGen/ARM64/addr-type-promotion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/addr-type-promotion.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/addr-type-promotion.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/addr-type-promotion.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,82 @@
+; RUN: llc -march arm64 < %s | FileCheck %s
+; rdar://13452552
+; ModuleID = 'reduced_test.ll'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+ at block = common global i8* null, align 8
+
+define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
+; CHECK: fullGtU
+; CHECK: adrp [[PAGE:x[0-9]+]], _block at GOTPAGE
+; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block at GOTPAGEOFF]
+; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
+; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]],  x0, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], x1, sxtw]
+; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw
+; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw
+; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
+; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
+; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2]
+; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2]
+; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]]
+entry:
+  %idxprom = sext i32 %i1 to i64
+  %tmp = load i8** @block, align 8
+  %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom
+  %tmp1 = load i8* %arrayidx, align 1
+  %idxprom1 = sext i32 %i2 to i64
+  %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1
+  %tmp2 = load i8* %arrayidx2, align 1
+  %cmp = icmp eq i8 %tmp1, %tmp2
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp7 = icmp ugt i8 %tmp1, %tmp2
+  %conv9 = zext i1 %cmp7 to i8
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %inc = add nsw i32 %i1, 1
+  %inc10 = add nsw i32 %i2, 1
+  %idxprom11 = sext i32 %inc to i64
+  %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11
+  %tmp3 = load i8* %arrayidx12, align 1
+  %idxprom13 = sext i32 %inc10 to i64
+  %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13
+  %tmp4 = load i8* %arrayidx14, align 1
+  %cmp17 = icmp eq i8 %tmp3, %tmp4
+  br i1 %cmp17, label %if.end25, label %if.then19
+
+if.then19:                                        ; preds = %if.end
+  %cmp22 = icmp ugt i8 %tmp3, %tmp4
+  %conv24 = zext i1 %cmp22 to i8
+  br label %return
+
+if.end25:                                         ; preds = %if.end
+  %inc26 = add nsw i32 %i1, 2
+  %inc27 = add nsw i32 %i2, 2
+  %idxprom28 = sext i32 %inc26 to i64
+  %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28
+  %tmp5 = load i8* %arrayidx29, align 1
+  %idxprom30 = sext i32 %inc27 to i64
+  %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30
+  %tmp6 = load i8* %arrayidx31, align 1
+  %cmp34 = icmp eq i8 %tmp5, %tmp6
+  br i1 %cmp34, label %return, label %if.then36
+
+if.then36:                                        ; preds = %if.end25
+  %cmp39 = icmp ugt i8 %tmp5, %tmp6
+  %conv41 = zext i1 %cmp39 to i8
+  br label %return
+
+return:                                           ; preds = %if.then36, %if.end25, %if.then19, %if.then
+  %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ]
+  ret i8 %retval.0
+}

Added: llvm/trunk/test/CodeGen/ARM64/addrmode.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/addrmode.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/addrmode.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/addrmode.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,72 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+; rdar://10232252
+
+ at object = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+; base + offset (imm9)
+; CHECK: @t1
+; CHECK: ldr xzr, [x{{[0-9]+}}, #8]
+; CHECK: ret
+define void @t1() {
+  %incdec.ptr = getelementptr inbounds i64* @object, i64 1
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}
+
+; base + offset (> imm9)
+; CHECK: @t2
+; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t2() {
+  %incdec.ptr = getelementptr inbounds i64* @object, i64 -33
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}
+
+; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes)
+; CHECK: @t3
+; CHECK: ldr xzr, [x{{[0-9]+}}, #32760]
+; CHECK: ret
+define void @t3() {
+  %incdec.ptr = getelementptr inbounds i64* @object, i64 4095
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}
+
+; base + unsigned offset (> imm12 * size of type in bytes)
+; CHECK: @t4
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #32768
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t4() {
+  %incdec.ptr = getelementptr inbounds i64* @object, i64 4096
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}
+
+; base + reg
+; CHECK: @t5
+; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3]
+; CHECK: ret
+define void @t5(i64 %a) {
+  %incdec.ptr = getelementptr inbounds i64* @object, i64 %a
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}
+
+; base + reg + imm
+; CHECK: @t6
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
+; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #32768
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t6(i64 %a) {
+  %tmp1 = getelementptr inbounds i64* @object, i64 %a
+  %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096
+  %tmp = load volatile i64* %incdec.ptr, align 8
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/alloc-no-stack-realign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/alloc-no-stack-realign.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/alloc-no-stack-realign.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/alloc-no-stack-realign.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s
+
+; rdar://12713765
+; Make sure we are not creating stack objects that are assumed to be 64-byte
+; aligned.
+ at T3_retval = common global <16 x float> zeroinitializer, align 16
+
+define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp {
+entry:
+; CHECK: test
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]]
+ %retval = alloca <16 x float>, align 16
+ %0 = load <16 x float>* @T3_retval, align 16
+ store <16 x float> %0, <16 x float>* %retval
+ %1 = load <16 x float>* %retval
+ store <16 x float> %1, <16 x float>* %agg.result, align 16
+ ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,29 @@
+; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s
+
+; CHECK: foo
+; CHECK: ldr w[[REG:[0-9]+]], [x19, #264]
+; CHECK: str w[[REG]], [x19, #132]
+; CHECK: ldr w{{[0-9]+}}, [x19, #264]
+
+define i32 @foo(i32 %a) nounwind {
+  %retval = alloca i32, align 4
+  %a.addr = alloca i32, align 4
+  %arr = alloca [32 x i32], align 4
+  %i = alloca i32, align 4
+  %arr2 = alloca [32 x i32], align 4
+  %j = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  %tmp1 = zext i32 %tmp to i64
+  %v = mul i64 4, %tmp1
+  %vla = alloca i8, i64 %v, align 4
+  %tmp2 = bitcast i8* %vla to i32*
+  %tmp3 = load i32* %a.addr, align 4
+  store i32 %tmp3, i32* %i, align 4
+  %tmp4 = load i32* %a.addr, align 4
+  store i32 %tmp4, i32* %j, align 4
+  %tmp5 = load i32* %j, align 4
+  store i32 %tmp5, i32* %retval
+  %x = load i32* %retval
+  ret i32 %x
+}

Added: llvm/trunk/test/CodeGen/ARM64/andCmpBrToTBZ.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/andCmpBrToTBZ.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/andCmpBrToTBZ.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/andCmpBrToTBZ.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,72 @@
+; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s
+; ModuleID = 'and-cbz-extr-mr.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 {
+; CHECK: _foo:
+entry:
+  %tobool = icmp eq i8* %str14, null
+  br i1 %tobool, label %return, label %if.end
+
+; CHECK: %if.end
+; CHECK: tbz
+if.end:                                           ; preds = %entry
+  %and.i.i.i = and i32 %int1, 4
+  %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0
+  br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i
+
+land.rhs.i:                                       ; preds = %if.end
+  %cmp.i.i.i = icmp eq i8* %str12, %str13
+  br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i
+
+lor.rhs.i.i.i:                                    ; preds = %land.rhs.i
+  %cmp.i13.i.i.i = icmp eq i8* %str10, %str11
+  br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i
+  %cmp.i.i.i.i = icmp eq i8* %str8, %str9
+  br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5
+
+if.then3:                                         ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i
+  %tmp11 = load i8* %str14, align 8
+  %tmp12 = and i8 %tmp11, 2
+  %tmp13 = icmp ne i8 %tmp12, 0
+  br label %return
+
+if.end5:                                          ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i
+; CHECK: %if.end5
+; CHECK: tbz
+  br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19
+
+land.rhs.i19:                                     ; preds = %if.end5
+  %cmp.i.i.i18 = icmp eq i8* %str6, %str7
+  br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23
+
+lor.rhs.i.i.i23:                                  ; preds = %land.rhs.i19
+  %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4
+  br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23
+  %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2
+  br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12
+
+if.then7:                                         ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19
+  br i1 %isTextField, label %if.then9, label %if.end12
+
+if.then9:                                         ; preds = %if.then7
+  %tmp23 = load i8* %str5, align 8
+  %tmp24 = and i8 %tmp23, 2
+  %tmp25 = icmp ne i8 %tmp24, 0
+  br label %return
+
+if.end12:                                         ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end
+  %lnot = xor i1 %IsEditable, true
+  br label %return
+
+return:                                           ; preds = %if.end12, %if.then9, %if.then3, %entry
+  %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ]
+  ret i1 %retval.0
+}
+
+attributes #0 = { nounwind ssp }

Added: llvm/trunk/test/CodeGen/ARM64/anyregcc-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/anyregcc-crash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/anyregcc-crash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/anyregcc-crash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,19 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+                        i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+                        i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+                        i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
+entry:
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+                i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+                i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+                i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+                i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32)
+  ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

Added: llvm/trunk/test/CodeGen/ARM64/anyregcc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/anyregcc.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/anyregcc.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/anyregcc.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,358 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 6 callsites
+; CHECK-LABEL: .section	__LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .long   0
+; Num Functions
+; CHECK-NEXT:   .long 8
+; CHECK-NEXT:   .long _test
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _property_access1
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _property_access2
+; CHECK-NEXT:   .long 32
+; CHECK-NEXT:   .long _property_access3
+; CHECK-NEXT:   .long 32
+; CHECK-NEXT:   .long _anyreg_test1
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _anyreg_test2
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _patchpoint_spilldef
+; CHECK-NEXT:   .long 112
+; CHECK-NEXT:   .long _patchpoint_spillargs
+; CHECK-NEXT:   .long 128
+; Num Constants
+; CHECK-NEXT:   .long   0
+; Num Callsites
+; CHECK-NEXT:   .long   8
+
+; test
+; CHECK-LABEL:  .long   L{{.*}}-_test
+; CHECK-NEXT:   .short  0
+; 3 locations
+; CHECK-NEXT:   .short  3
+; Loc 0: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+  ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL:  .long   L{{.*}}-_property_access1
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+  ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL:  .long   L{{.*}}-_property_access2
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+  ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-LABEL:  .long   L{{.*}}-_property_access3
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Direct FP - 8
+; CHECK-NEXT:   .byte 2
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 29
+; CHECK-NEXT:   .long -8
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+  ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-LABEL:  .long   L{{.*}}-_anyreg_test1
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-LABEL:  .long   L{{.*}}-_anyreg_test2
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 281474417671919 to i8*
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 2: Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+  ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return a register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 3: Arg2 spilled to FP -96
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -96
+; Loc 4: Arg3 spilled to FP - 88
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -88
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

Added: llvm/trunk/test/CodeGen/ARM64/arith-saturating.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/arith-saturating.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/arith-saturating.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/arith-saturating.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qadds:
+; CHECK: sqadd s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqadd.i
+}
+
+define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qaddd:
+; CHECK: sqadd d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqadd.i
+}
+
+define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqadds:
+; CHECK: uqadd s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqadd.i
+}
+
+define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqaddd:
+; CHECK: uqadd d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqadd.i
+}
+
+declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) nounwind readnone
+
+define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubs:
+; CHECK: sqsub s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqsub.i
+}
+
+define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubd:
+; CHECK: sqsub d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqsub.i
+}
+
+define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubs:
+; CHECK: uqsub s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqsub.i
+}
+
+define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubd:
+; CHECK: uqsub d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqsub.i
+}
+
+declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone
+
+define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qabss:
+; CHECK: sqabs s0, s0
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind
+  ret i32 %vqabs.i
+}
+
+define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qabsd:
+; CHECK: sqabs d0, d0
+; CHECK: ret
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind
+  ret i64 %vqabs.i
+}
+
+define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qnegs:
+; CHECK: sqneg s0, s0
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind
+  ret i32 %vqneg.i
+}
+
+define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qnegd:
+; CHECK: sqneg d0, d0
+; CHECK: ret
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind
+  ret i64 %vqneg.i
+}
+
+declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone
+
+
+define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovund:
+; CHECK: sqxtun s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovun.i
+}
+
+define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_s:
+; CHECK: sqxtn s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovn.i
+}
+
+define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_u:
+; CHECK: uqxtn s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovn.i
+}
+
+declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/arith.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/arith.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/arith.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,262 @@
+; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: add w0, w1, w0
+; CHECK: ret
+  %add = add i32 %b, %a
+  ret i32 %add
+}
+
+define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+  %udiv = udiv i32 %a, %b
+  ret i32 %udiv
+}
+
+define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+  %udiv = udiv i64 %a, %b
+  ret i64 %udiv
+}
+
+define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+  %sdiv = sdiv i32 %a, %b
+  ret i32 %sdiv
+}
+
+define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+  %sdiv = sdiv i64 %a, %b
+  ret i64 %sdiv
+}
+
+define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: lslv w0, w0, w1
+; CHECK: ret
+  %shl = shl i32 %a, %b
+  ret i32 %shl
+}
+
+define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: lslv x0, x0, x1
+; CHECK: ret
+  %shl = shl i64 %a, %b
+  ret i64 %shl
+}
+
+define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: lsrv w0, w0, w1
+; CHECK: ret
+  %lshr = lshr i32 %a, %b
+  ret i32 %lshr
+}
+
+define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t9:
+; CHECK: lsrv x0, x0, x1
+; CHECK: ret
+  %lshr = lshr i64 %a, %b
+  ret i64 %lshr
+}
+
+define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t10:
+; CHECK: asrv w0, w0, w1
+; CHECK: ret
+  %ashr = ashr i32 %a, %b
+  ret i32 %ashr
+}
+
+define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t11:
+; CHECK: asrv x0, x0, x1
+; CHECK: ret
+  %ashr = ashr i64 %a, %b
+  ret i64 %ashr
+}
+
+define i32 @t12(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t12:
+; CHECK: add	w0, w1, w0, sxth
+; CHECK: ret
+  %c = sext i16 %a to i32
+  %e = add i32 %x, %c
+  ret i32 %e
+}
+
+define i32 @t13(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t13:
+; CHECK: add	w0, w1, w0, sxth #2
+; CHECK: ret
+  %c = sext i16 %a to i32
+  %d = shl i32 %c, 2
+  %e = add i32 %x, %d
+  ret i32 %e
+}
+
+define i64 @t14(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t14:
+; CHECK: add	x0, x1, w0, uxth #3
+; CHECK: ret
+  %c = zext i16 %a to i64
+  %d = shl i64 %c, 3
+  %e = add i64 %x, %d
+  ret i64 %e
+}
+
+; rdar://9160598
+define i64 @t15(i64 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t15:
+; CHECK: add x0, x1, w0, uxtw
+; CHECK: ret
+  %b = and i64 %a, 4294967295
+  %c = add i64 %x, %b
+  ret i64 %c
+}
+
+define i64 @t16(i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t16:
+; CHECK: lsl x0, x0, #1
+; CHECK: ret
+  %a = shl i64 %x, 1
+  ret i64 %a
+}
+
+; rdar://9166974
+define i64 @t17(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t17:
+; CHECK: sxth [[REG:x[0-9]+]], x0
+; CHECK: sub x0, xzr, [[REG]], lsl #32
+; CHECK: ret
+  %tmp16 = sext i16 %a to i64
+  %tmp17 = mul i64 %tmp16, -4294967296
+  ret i64 %tmp17
+}
+
+define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t18:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+  %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b)
+  ret i32 %sdiv
+}
+
+define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t19:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+  %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b)
+  ret i64 %sdiv
+}
+
+define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t20:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+  %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b)
+  ret i32 %udiv
+}
+
+define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t21:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+  %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b)
+  ret i64 %udiv
+}
+
+declare i32 @llvm.arm64.sdiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone
+
+; 32-bit not.
+define i32 @inv_32(i32 %x) nounwind ssp {
+entry:
+; CHECK: inv_32
+; CHECK: mvn w0, w0
+; CHECK: ret
+  %inv = xor i32 %x, -1
+  ret i32 %inv
+}
+
+; 64-bit not.
+define i64 @inv_64(i64 %x) nounwind ssp {
+entry:
+; CHECK: inv_64
+; CHECK: mvn x0, x0
+; CHECK: ret
+  %inv = xor i64 %x, -1
+  ret i64 %inv
+}
+
+; Multiplying by a power of two plus or minus one is better done via shift
+; and add/sub rather than the madd/msub instructions. The latter are 4+ cycles,
+; and the former are two (total for the two instruction sequence for subtract).
+define i32 @f0(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f0:
+; CHECK-NEXT: add w0, w0, w0, lsl #3
+; CHECK-NEXT: ret
+  %res = mul i32 %a, 9
+  ret i32 %res
+}
+
+define i64 @f1(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f1:
+; CHECK-NEXT: lsl x8, x0, #4
+; CHECK-NEXT: sub x0, x8, x0
+; CHECK-NEXT: ret
+  %res = mul i64 %a, 15
+  ret i64 %res
+}
+
+define i32 @f2(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f2:
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: sub w0, w8, w0
+; CHECK-NEXT: ret
+  %res = mul nsw i32 %a, 7
+  ret i32 %res
+}
+
+define i64 @f3(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f3:
+; CHECK-NEXT: add x0, x0, x0, lsl #4
+; CHECK-NEXT: ret
+  %res = mul nsw i64 %a, 17
+  ret i64 %res
+}

Added: llvm/trunk/test/CodeGen/ARM64/atomic-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/atomic-128.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/atomic-128.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/atomic-128.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,213 @@
+; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs | FileCheck %s
+
+ at var = global i128 0
+
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp   [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x0]
+; CHECK: cmp    [[RESULTLO]], x2
+; CHECK: sbc    xzr, [[RESULTHI]], x3
+; CHECK: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK: stxp   [[SCRATCH_RES:w[0-9]+]], x4, x5, [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+  %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+  ret i128 %val
+}
+
+define void @fetch_and_nand(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: bic    [[SCRATCH_REGLO:x[0-9]+]], x2, [[DEST_REGLO]]
+; CHECK: bic    [[SCRATCH_REGHI:x[0-9]+]], x3, [[DEST_REGHI]]
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw nand i128* %p, i128 %bits release
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_or(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: orr    [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: orr    [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw or i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_add(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_add:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: adds   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: adc    [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw add i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_sub(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_sub:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: subs   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: sbc    [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw sub i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_min(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_min:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp    [[DEST_REGLO]], x2
+; CHECK: sbc    xzr, [[DEST_REGHI]], x3
+; CHECK: csel   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, lt
+; CHECK: csel   [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, lt
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw min i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_max(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_max:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp    [[DEST_REGLO]], x2
+; CHECK: sbc    xzr, [[DEST_REGHI]], x3
+; CHECK: csel   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, gt
+; CHECK: csel   [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, gt
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw max i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_umin(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umin:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp    [[DEST_REGLO]], x2
+; CHECK: sbc    xzr, [[DEST_REGHI]], x3
+; CHECK: csel   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, cc
+; CHECK: csel   [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, cc
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp  [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp    [[DEST_REGLO]], x2
+; CHECK: sbc    xzr, [[DEST_REGHI]], x3
+; CHECK: csel   [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, hi
+; CHECK: csel   [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, hi
+; CHECK: stlxp  [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz   [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str    [[DEST_REGHI]]
+; CHECK: str    [[DEST_REGLO]]
+  %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+  store i128 %val, i128* @var, align 16
+  ret void
+}
+
+define i128 @atomic_load_seq_cst(i128* %p) {
+; CHECK-LABEL: atomic_load_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: ldaxp
+; CHECK-NOT: dmb
+   %r = load atomic i128* %p seq_cst, align 16
+   ret i128 %r
+}
+
+define i128 @atomic_load_relaxed(i128* %p) {
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: orr [[SAMELO:x[0-9]+]], [[LO]], xzr
+; CHECK: orr [[SAMEHI:x[0-9]+]], [[HI]], xzr
+; CHECK: stxp [[SUCCESS:w[0-9]+]], [[SAMELO]], [[SAMEHI]], [x0]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+   %r = load atomic i128* %p monotonic, align 16
+   ret i128 %r
+}
+
+
+define void @atomic_store_seq_cst(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+   store atomic i128 %in, i128* %p seq_cst, align 16
+   ret void
+}
+
+define void @atomic_store_release(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_release:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+   store atomic i128 %in, i128* %p release, align 16
+   ret void
+}
+
+define void @atomic_store_relaxed(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+   store atomic i128 %in, i128* %p unordered, align 16
+   ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/atomic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/atomic.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/atomic.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/atomic.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,343 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+
+define i32 @val_compare_and_swap(i32* %p) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: orr    [[NEWVAL_REG:w[0-9]+]], wzr, #0x4
+; CHECK: orr    [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr   [[RESULT:w[0-9]+]], [x0]
+; CHECK: cmp    [[RESULT]], [[OLDVAL_REG]]
+; CHECK: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+  %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+  ret i32 %val
+}
+
+define i64 @val_compare_and_swap_64(i64* %p) {
+; CHECK-LABEL: val_compare_and_swap_64:
+; CHECK: orr    [[NEWVAL_REG:x[0-9]+]], xzr, #0x4
+; CHECK: orr    [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr   [[RESULT:x[0-9]+]], [x0]
+; CHECK: cmp    [[RESULT]], [[OLDVAL_REG]]
+; CHECK: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NOT: stxr [[NEWVAL_REG]], [[NEWVAL_REG]]
+; CHECK: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+  %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+  ret i64 %val
+}
+
+define i32 @fetch_and_nand(i32* %p) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: orr    [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr   w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: bic    [[SCRATCH2_REG:w[0-9]+]], [[OLDVAL_REG]], w[[DEST_REG]]
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov    x0, x[[DEST_REG]]
+  %val = atomicrmw nand i32* %p, i32 7 release
+  ret i32 %val
+}
+
+define i64 @fetch_and_nand_64(i64* %p) {
+; CHECK-LABEL: fetch_and_nand_64:
+; CHECK: orr    [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr   [[DEST_REG:x[0-9]+]], [x0]
+; CHECK: bic    [[SCRATCH2_REG:x[0-9]+]], [[OLDVAL_REG]], [[DEST_REG]]
+; CHECK: stlxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov    x0, [[DEST_REG]]
+  %val = atomicrmw nand i64* %p, i64 7 acq_rel
+  ret i64 %val
+}
+
+define i32 @fetch_and_or(i32* %p) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: movz   [[OLDVAL_REG:w[0-9]+]], #5
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr   w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: orr    [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov    x0, x[[DEST_REG]]
+  %val = atomicrmw or i32* %p, i32 5 seq_cst
+  ret i32 %val
+}
+
+define i64 @fetch_and_or_64(i64* %p) {
+; CHECK-LABEL: fetch_and_or_64:
+; CHECK: orr    [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr   [[DEST_REG:x[0-9]+]], [x0]
+; CHECK: orr    [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], [[OLDVAL_REG]]
+; CHECK: stxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov    x0, [[DEST_REG]]
+  %val = atomicrmw or i64* %p, i64 7 monotonic
+  ret i64 %val
+}
+
+define void @acquire_fence() {
+   fence acquire
+   ret void
+   ; CHECK-LABEL: acquire_fence:
+   ; CHECK: dmb ishld
+}
+
+define void @release_fence() {
+   fence release
+   ret void
+   ; CHECK-LABEL: release_fence:
+   ; CHECK: dmb ish{{$}}
+}
+
+define void @seq_cst_fence() {
+   fence seq_cst
+   ret void
+   ; CHECK-LABEL: seq_cst_fence:
+   ; CHECK: dmb ish{{$}}
+}
+
+define i32 @atomic_load(i32* %p) {
+   %r = load atomic i32* %p seq_cst, align 4
+   ret i32 %r
+   ; CHECK-LABEL: atomic_load:
+   ; CHECK: ldar
+}
+
+define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_8:
+  %ptr_unsigned = getelementptr i8* %p, i32 4095
+  %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1
+; CHECK: ldrb {{w[0-9]+}}, [x0, #4095]
+
+  %ptr_regoff = getelementptr i8* %p, i32 %off32
+  %val_regoff = load atomic i8* %ptr_regoff unordered, align 1
+  %tot1 = add i8 %val_unsigned, %val_regoff
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldrb {{w[0-9]+}}, [x0, x1, sxtw]
+
+  %ptr_unscaled = getelementptr i8* %p, i32 -256
+  %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1
+  %tot2 = add i8 %tot1, %val_unscaled
+; CHECK: ldurb {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+  %val_random = load atomic i8* %ptr_random unordered, align 1
+  %tot3 = add i8 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret i8 %tot3
+}
+
+define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_16:
+  %ptr_unsigned = getelementptr i16* %p, i32 4095
+  %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr i16* %p, i32 %off32
+  %val_regoff = load atomic i16* %ptr_regoff unordered, align 2
+  %tot1 = add i16 %val_unsigned, %val_regoff
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldrh {{w[0-9]+}}, [x0, x1, sxtw #1]
+
+  %ptr_unscaled = getelementptr i16* %p, i32 -128
+  %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2
+  %tot2 = add i16 %tot1, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+  %val_random = load atomic i16* %ptr_random unordered, align 2
+  %tot3 = add i16 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret i16 %tot3
+}
+
+define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_32:
+  %ptr_unsigned = getelementptr i32* %p, i32 4095
+  %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4
+; CHECK: ldr {{w[0-9]+}}, [x0, #16380]
+
+  %ptr_regoff = getelementptr i32* %p, i32 %off32
+  %val_regoff = load atomic i32* %ptr_regoff unordered, align 4
+  %tot1 = add i32 %val_unsigned, %val_regoff
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldr {{w[0-9]+}}, [x0, x1, sxtw #2]
+
+  %ptr_unscaled = getelementptr i32* %p, i32 -64
+  %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4
+  %tot2 = add i32 %tot1, %val_unscaled
+; CHECK: ldur {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+  %val_random = load atomic i32* %ptr_random unordered, align 4
+  %tot3 = add i32 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret i32 %tot3
+}
+
+define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_64:
+  %ptr_unsigned = getelementptr i64* %p, i32 4095
+  %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8
+; CHECK: ldr {{x[0-9]+}}, [x0, #32760]
+
+  %ptr_regoff = getelementptr i64* %p, i32 %off32
+  %val_regoff = load atomic i64* %ptr_regoff unordered, align 8
+  %tot1 = add i64 %val_unsigned, %val_regoff
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldr {{x[0-9]+}}, [x0, x1, sxtw #3]
+
+  %ptr_unscaled = getelementptr i64* %p, i32 -32
+  %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8
+  %tot2 = add i64 %tot1, %val_unscaled
+; CHECK: ldur {{x[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+  %val_random = load atomic i64* %ptr_random unordered, align 8
+  %tot3 = add i64 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]]
+
+  ret i64 %tot3
+}
+
+
+define void @atomc_store(i32* %p) {
+   store atomic i32 4, i32* %p seq_cst, align 4
+   ret void
+   ; CHECK-LABEL: atomc_store:
+   ; CHECK: stlr
+}
+
+define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
+; CHECK-LABEL: atomic_store_relaxed_8:
+  %ptr_unsigned = getelementptr i8* %p, i32 4095
+  store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, #4095]
+
+  %ptr_regoff = getelementptr i8* %p, i32 %off32
+  store atomic i8 %val, i8* %ptr_regoff unordered, align 1
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: strb {{w[0-9]+}}, [x0, x1, sxtw]
+
+  %ptr_unscaled = getelementptr i8* %p, i32 -256
+  store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1
+; CHECK: sturb {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+  store atomic i8 %val, i8* %ptr_random unordered, align 1
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret void
+}
+
+define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
+; CHECK-LABEL: atomic_store_relaxed_16:
+  %ptr_unsigned = getelementptr i16* %p, i32 4095
+  store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr i16* %p, i32 %off32
+  store atomic i16 %val, i16* %ptr_regoff unordered, align 2
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: strh {{w[0-9]+}}, [x0, x1, sxtw #1]
+
+  %ptr_unscaled = getelementptr i16* %p, i32 -128
+  store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2
+; CHECK: sturh {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+  store atomic i16 %val, i16* %ptr_random unordered, align 2
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret void
+}
+
+define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
+; CHECK-LABEL: atomic_store_relaxed_32:
+  %ptr_unsigned = getelementptr i32* %p, i32 4095
+  store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4
+; CHECK: str {{w[0-9]+}}, [x0, #16380]
+
+  %ptr_regoff = getelementptr i32* %p, i32 %off32
+  store atomic i32 %val, i32* %ptr_regoff unordered, align 4
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: str {{w[0-9]+}}, [x0, x1, sxtw #2]
+
+  %ptr_unscaled = getelementptr i32* %p, i32 -64
+  store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4
+; CHECK: stur {{w[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+  store atomic i32 %val, i32* %ptr_random unordered, align 4
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: str {{w[0-9]+}}, [x[[ADDR]]]
+
+  ret void
+}
+
+define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
+; CHECK-LABEL: atomic_store_relaxed_64:
+  %ptr_unsigned = getelementptr i64* %p, i32 4095
+  store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8
+; CHECK: str {{x[0-9]+}}, [x0, #32760]
+
+  %ptr_regoff = getelementptr i64* %p, i32 %off32
+  store atomic i64 %val, i64* %ptr_regoff unordered, align 8
+  ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: str {{x[0-9]+}}, [x0, x1, sxtw #3]
+
+  %ptr_unscaled = getelementptr i64* %p, i32 -32
+  store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8
+; CHECK: stur {{x[0-9]+}}, [x0, #-256]
+
+  %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+  store atomic i64 %val, i64* %ptr_random unordered, align 8
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: str {{x[0-9]+}}, [x[[ADDR]]]
+
+  ret void
+}
+
+; rdar://11531169
+; rdar://11531308
+
+%"class.X::Atomic" = type { %struct.x_atomic_t }
+%struct.x_atomic_t = type { i32 }
+
+ at counter = external hidden global %"class.X::Atomic", align 4
+
+define i32 @next_id() nounwind optsize ssp align 2 {
+entry:
+  %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+  %add.i = add i32 %0, 1
+  %tobool = icmp eq i32 %add.i, 0
+  br i1 %tobool, label %if.else, label %return
+
+if.else:                                          ; preds = %entry
+  %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+  %add.i2 = add i32 %1, 1
+  br label %return
+
+return:                                           ; preds = %if.else, %entry
+  %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ]
+  ret i32 %retval.0
+}

Added: llvm/trunk/test/CodeGen/ARM64/big-imm-offsets.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/big-imm-offsets.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/big-imm-offsets.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/big-imm-offsets.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm64 < %s
+
+
+; Make sure large offsets aren't mistaken for valid immediate offsets.
+; <rdar://problem/13190511>
+define void @f(i32* nocapture %p) {
+entry:
+  %a = ptrtoint i32* %p to i64
+  %ao = add i64 %a, 25769803792
+  %b = inttoptr i64 %ao to i32*
+  store volatile i32 0, i32* %b, align 4
+  store volatile i32 0, i32* %b, align 4
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/big-stack.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/big-stack.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/big-stack.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/big-stack.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Check that big stacks are generated correctly.
+; Currently, this is done by a sequence of sub instructions,
+; which can encode a 12-bit immediate, optionally
+; shifted left (by up to 12). I.e., 16773120 is the biggest value.
+; <rdar://12513931>
+; CHECK-LABEL: foo:
+; CHECK: sub sp, sp, #16773120
+; CHECK: sub sp, sp, #16773120
+; CHECK: sub sp, sp, #8192
+define void @foo() nounwind ssp {
+entry:
+  %buffer = alloca [33554432 x i8], align 1
+  %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0
+  call void @doit(i8* %arraydecay) nounwind
+  ret void
+}
+
+declare void @doit(i8*)

Added: llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,406 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+%struct.X = type { i8, i8, [2 x i8] }
+%struct.Y = type { i32, i8 }
+%struct.Z = type { i8, i8, [2 x i8], i16 }
+%struct.A = type { i64, i8 }
+
+define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK: ubfm
+; CHECK-NOT: and
+; CHECK: ret
+
+  %tmp = bitcast %struct.X* %x to i32*
+  %tmp1 = load i32* %tmp, align 4
+  %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1
+  %bf.clear = lshr i32 %tmp1, 3
+  %bf.clear.lobit = and i32 %bf.clear, 1
+  %frombool = trunc i32 %bf.clear.lobit to i8
+  store i8 %frombool, i8* %b, align 1
+  ret void
+}
+
+define i32 @baz(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: baz:
+; CHECK: sbfm  w0, w0, #0, #3
+  %tmp = trunc i64 %cav1.coerce to i32
+  %tmp1 = shl i32 %tmp, 28
+  %bf.val.sext = ashr exact i32 %tmp1, 28
+  ret i32 %bf.val.sext
+}
+
+define i32 @bar(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sbfm  w0, w0, #4, #9
+  %tmp = trunc i64 %cav1.coerce to i32
+  %cav1.sroa.0.1.insert = shl i32 %tmp, 22
+  %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
+  ret i32 %tmp1
+}
+
+define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: fct1:
+; CHECK: ubfm
+; CHECK-NOT: and
+; CHECK: ret
+
+  %tmp = bitcast %struct.Z* %x to i64*
+  %tmp1 = load i64* %tmp, align 4
+  %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0
+  %bf.clear = lshr i64 %tmp1, 3
+  %bf.clear.lobit = and i64 %bf.clear, 1
+  store i64 %bf.clear.lobit, i64* %b, align 8
+  ret void
+}
+
+define i64 @fct2(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct2:
+; CHECK: sbfm  x0, x0, #0, #35
+  %tmp = shl i64 %cav1.coerce, 28
+  %bf.val.sext = ashr exact i64 %tmp, 28
+  ret i64 %bf.val.sext
+}
+
+define i64 @fct3(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct3:
+; CHECK: sbfm  x0, x0, #4, #41
+  %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
+  %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
+  ret i64 %tmp1
+}
+
+define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct4:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #39
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -16777216
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 16777215
+  %or = or i64 %and, %and1
+  store i64 %or, i64* %y, align 8
+  ret void
+}
+
+define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct5:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -8
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 7
+  %or = or i32 %and, %and1
+  store i32 %or, i32* %y, align 8
+  ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some low bits
+define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct6:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -8
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 7
+  %or = or i32 %and, %and1
+  %shr1 = lshr i32 %or, 2
+  store i32 %shr1, i32* %y, align 8
+  ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct7:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -8
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 7
+  %or = or i32 %and, %and1
+  %shl = shl i32 %or, 2
+  store i32 %shl, i32* %y, align 8
+  ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some low bits
+; (i64 version)
+define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct8:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -8
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 7
+  %or = or i64 %and, %and1
+  %shr1 = lshr i64 %or, 2
+  store i64 %shr1, i64* %y, align 8
+  ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; (i64 version)
+define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct9:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -8
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 7
+  %or = or i64 %and, %and1
+  %shl = shl i64 %or, 2
+  store i64 %shl, i64* %y, align 8
+  ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i32 version)
+define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct10:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #0, #2
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -8
+  %and1 = and i32 %x, 7
+  %or = or i32 %and, %and1
+  %shl = shl i32 %or, 2
+  store i32 %shl, i32* %y, align 8
+  ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i64 version)
+define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct11:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #0, #2
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -8
+  %and1 = and i64 %x, 7
+  %or = or i64 %and, %and1
+  %shl = shl i64 %or, 2
+  store i64 %shl, i64* %y, align 8
+  ret void
+}
+
+define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 {
+; CHECK-LABEL: fct12bis:
+; CHECK-NOT: and
+; CHECK: ubfm w0, w0, #11, #11
+  %and.i.i = and i32 %tmp2, 2048
+  %tobool.i.i = icmp ne i32 %and.i.i, 0
+  ret i1 %tobool.i.i
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct12:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG2:w[0-9]+]], [[REG1]], #2, #29
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -8
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 7
+  %or = or i32 %and, %and1
+  %shl = shl i32 %or, 2
+  %shr2 = lshr i32 %shl, 4
+  store i32 %shr2, i32* %y, align 8
+  ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct13:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG2:x[0-9]+]], [[REG1]], #2, #61
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -8
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 7
+  %or = or i64 %and, %and1
+  %shl = shl i64 %or, 2
+  %shr2 = lshr i64 %shl, 4
+  store i64 %shr2, i64* %y, align 8
+  ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct14:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #23
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfm [[REG2]], w2, #5, #7
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, -256
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 255
+  %or = or i32 %and, %and1
+  %shl = lshr i32 %or, 4
+  %and2 = and i32 %shl, -8
+  %shr1 = lshr i32 %x1, 5
+  %and3 = and i32 %shr1, 7
+  %or1 = or i32 %and2, %and3
+  %shl1 = shl i32 %or1, 2
+  store i32 %shl1, i32* %y, align 8
+  ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct15:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #23
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfm [[REG2]], x2, #5, #7
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, -256
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 255
+  %or = or i64 %and, %and1
+  %shl = lshr i64 %or, 4
+  %and2 = and i64 %shl, -8
+  %shr1 = lshr i64 %x1, 5
+  %and3 = and i64 %shr1, 7
+  %or1 = or i64 %and2, %and3
+  %shl1 = shl i64 %or1, 2
+  store i64 %shl1, i64* %y, align 8
+  ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct16:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; Create the constant
+; CHECK: movz [[REGCST:w[0-9]+]], #26, lsl #16
+; CHECK: movk [[REGCST]], #33120
+; Do the masking
+; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]]
+; CHECK-NEXT: bfm [[REG2]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG3:w[0-9]+]], [[REG2]], #2, #29
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+  %0 = load i32* %y, align 8
+  %and = and i32 %0, 1737056
+  %shr = lshr i32 %x, 16
+  %and1 = and i32 %shr, 7
+  %or = or i32 %and, %and1
+  %shl = shl i32 %or, 2
+  %shr2 = lshr i32 %shl, 4
+  store i32 %shr2, i32* %y, align 8
+  ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+; (i64 version)
+define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; Create the constant
+; CHECK: movz [[REGCST:x[0-9]+]], #26, lsl #16
+; CHECK: movk [[REGCST]], #33120
+; Do the masking
+; CHECK: and [[REG2:x[0-9]+]], [[REG1]], [[REGCST]]
+; CHECK-NEXT: bfm [[REG2]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG3:x[0-9]+]], [[REG2]], #2, #61
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+  %0 = load i64* %y, align 8
+  %and = and i64 %0, 1737056
+  %shr = lshr i64 %x, 16
+  %and1 = and i64 %shr, 7
+  %or = or i64 %and, %and1
+  %shl = shl i64 %or, 2
+  %shr2 = lshr i64 %shl, 4
+  store i64 %shr2, i64* %y, align 8
+  ret void
+}
+
+define i64 @fct18(i32 %xor72) nounwind ssp {
+; CHECK-LABEL: fct18:
+; CHECK: ubfm x0, x0, #9, #16
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %result = and i64 %conv82, 255
+  ret i64 %result
+}

Added: llvm/trunk/test/CodeGen/ARM64/blockaddress.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/blockaddress.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/blockaddress.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/blockaddress.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE
+
+; rdar://9188695
+
+define i64 @t() nounwind ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: adrp [[REG:x[0-9]+]], Ltmp1 at PAGE
+; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1 at PAGEOFF
+
+; CHECK-LINUX-LABEL: t:
+; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1
+; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1
+
+; CHECK-LARGE-LABEL: t:
+; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]]
+
+  %recover = alloca i64, align 8
+  store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8
+  br label %mylabel
+
+mylabel:
+  %tmp = load volatile i64* %recover, align 8
+  ret i64 %tmp
+}

Added: llvm/trunk/test/CodeGen/ARM64/build-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/build-vector.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/build-vector.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/build-vector.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; Check that building up a vector w/ only one non-zero lane initializes
+; intelligently.
+define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
+; CHECK-LABEL: one_lane:
+; CHECK: dup.16b v[[REG:[0-9]+]], wzr
+; CHECK-NEXT: ins.b v[[REG]][0], w1
+; v and q are aliases, and str is preferred over st.16b when possible
+; rdar://11246289
+; CHECK: str q[[REG]], [x0]
+; CHECK: ret
+  %conv = trunc i32 %skip0 to i8
+  %vset_lane = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %conv, i32 0
+  %tmp = bitcast i32* %out_int to <4 x i32>*
+  %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32>
+  store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16
+  ret void
+}
+
+; Check that building a vector from floats doesn't insert an unnecessary
+; copy for lane zero.
+define <4 x float>  @foo(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NOT: ins.s v0[0], v0[0]
+; CHECK: ins.s v0[1], v1[0]
+; CHECK: ins.s v0[2], v2[0]
+; CHECK: ins.s v0[3], v3[0]
+; CHECK: ret
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float %b, i32 1
+  %3 = insertelement <4 x float> %2, float %c, i32 2
+  %4 = insertelement <4 x float> %3, float %d, i32 3
+  ret <4 x float> %4
+}

Added: llvm/trunk/test/CodeGen/ARM64/call-tailcalls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/call-tailcalls.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/call-tailcalls.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/call-tailcalls.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,91 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+ at t = weak global i32 ()* null
+ at x = external global i32, align 4
+
+define void @t2() {
+; CHECK-LABEL: t2:
+; CHECK: adrp	x[[GOTADDR:[0-9]+]], _t at GOTPAGE
+; CHECK: ldr	x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t at GOTPAGEOFF]
+; CHECK: ldr	x[[DEST:[0-9]+]], [x[[ADDR]]]
+; CHECK: br	x[[DEST]]
+  %tmp = load i32 ()** @t
+  %tmp.upgrd.2 = tail call i32 %tmp()
+  ret void
+}
+
+define void @t3() {
+; CHECK-LABEL: t3:
+; CHECK: b	_t2
+  tail call void @t2()
+  ret void
+}
+
+define double @t4(double %a) nounwind readonly ssp {
+; CHECK-LABEL: t4:
+; CHECK: b	_sin
+  %tmp = tail call double @sin(double %a) nounwind readonly
+  ret double %tmp
+}
+
+define float @t5(float %a) nounwind readonly ssp {
+; CHECK-LABEL: t5:
+; CHECK: b	_sinf
+  %tmp = tail call float @sinf(float %a) nounwind readonly
+  ret float %tmp
+}
+
+define void @t7() nounwind {
+; CHECK-LABEL: t7:
+; CHECK: b	_foo
+; CHECK: b	_bar
+
+  br i1 undef, label %bb, label %bb1.lr.ph
+
+bb1.lr.ph:                                        ; preds = %entry
+  tail call void @bar() nounwind
+  ret void
+
+bb:                                               ; preds = %entry
+  tail call void @foo() nounwind
+  ret void
+}
+
+define i32 @t8(i32 %x) nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK: b	_a
+; CHECK: b	_b
+; CHECK: b	_c
+  %and = and i32 %x, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 @a(i32 %x) nounwind
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %and1 = and i32 %x, 2
+  %tobool2 = icmp eq i32 %and1, 0
+  br i1 %tobool2, label %if.end5, label %if.then3
+
+if.then3:                                         ; preds = %if.end
+  %call4 = tail call i32 @b(i32 %x) nounwind
+  br label %return
+
+if.end5:                                          ; preds = %if.end
+  %call6 = tail call i32 @c(i32 %x) nounwind
+  br label %return
+
+return:                                           ; preds = %if.end5, %if.then3, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ]
+  ret i32 %retval.0
+}
+
+declare float @sinf(float) nounwind readonly
+declare double @sin(double) nounwind readonly
+declare void @bar() nounwind
+declare void @foo() nounwind
+declare i32 @a(i32)
+declare i32 @b(i32)
+declare i32 @c(i32)

Added: llvm/trunk/test/CodeGen/ARM64/cast-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/cast-opt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/cast-opt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/cast-opt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,31 @@
+; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s
+; <rdar://problem/15992732>
+; Zero truncation is not necessary when the values are extended properly
+; already.
+
+ at block = common global i8* null, align 8
+
+define zeroext i8 @foo(i32 %i1, i32 %i2) {
+; CHECK-LABEL: foo:
+; CHECK: csinc
+; CHECK-NOT: and
+entry:
+  %idxprom = sext i32 %i1 to i64
+  %0 = load i8** @block, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+  %1 = load i8* %arrayidx, align 1
+  %idxprom1 = sext i32 %i2 to i64
+  %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+  %2 = load i8* %arrayidx2, align 1
+  %cmp = icmp eq i8 %1, %2
+  br i1 %cmp, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp7 = icmp ugt i8 %1, %2
+  %conv9 = zext i1 %cmp7 to i8
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ]
+  ret i8 %retval.0
+}

Added: llvm/trunk/test/CodeGen/ARM64/ccmp-heuristics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ccmp-heuristics.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ccmp-heuristics.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ccmp-heuristics.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,190 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+ at channelColumns = external global i64
+ at channelTracks = external global i64
+ at mazeRoute = external hidden unnamed_addr global i8*, align 8
+ at TOP = external global i64*
+ at BOT = external global i64*
+ at netsAssign = external global i64*
+
+; Function from yacr2/maze.c
+; The branch at the end of %if.then is driven by %cmp5 and %cmp6.
+; Isel converts the and i1 into two branches, and arm64-ccmp should not convert
+; it back again. %cmp6 has much higher latency than %cmp5.
+; CHECK: Maze1
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+define i32 @Maze1() nounwind ssp {
+entry:
+  %0 = load i64* @channelColumns, align 8, !tbaa !0
+  %cmp90 = icmp eq i64 %0, 0
+  br i1 %cmp90, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ]
+  %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ]
+  %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+  %2 = load i8** @mazeRoute, align 8, !tbaa !3
+  %arrayidx = getelementptr inbounds i8* %2, i64 %i.092
+  %3 = load i8* %arrayidx, align 1, !tbaa !1
+  %tobool = icmp eq i8 %3, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %4 = load i64** @TOP, align 8, !tbaa !3
+  %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092
+  %5 = load i64* %arrayidx1, align 8, !tbaa !0
+  %6 = load i64** @netsAssign, align 8, !tbaa !3
+  %arrayidx2 = getelementptr inbounds i64* %6, i64 %5
+  %7 = load i64* %arrayidx2, align 8, !tbaa !0
+  %8 = load i64** @BOT, align 8, !tbaa !3
+  %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092
+  %9 = load i64* %arrayidx3, align 8, !tbaa !0
+  %arrayidx4 = getelementptr inbounds i64* %6, i64 %9
+  %10 = load i64* %arrayidx4, align 8, !tbaa !0
+  %cmp5 = icmp ugt i64 %i.092, 1
+  %cmp6 = icmp ugt i64 %10, 1
+  %or.cond = and i1 %cmp5, %cmp6
+  br i1 %or.cond, label %land.lhs.true7, label %if.else
+
+land.lhs.true7:                                   ; preds = %if.then
+  %11 = load i64* @channelTracks, align 8, !tbaa !0
+  %add = add i64 %11, 1
+  %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1)
+  %tobool8 = icmp eq i32 %call, 0
+  br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9
+
+land.lhs.true7.if.else_crit_edge:                 ; preds = %land.lhs.true7
+  %.pre = load i64* @channelColumns, align 8, !tbaa !0
+  br label %if.else
+
+if.then9:                                         ; preds = %land.lhs.true7
+  %12 = load i8** @mazeRoute, align 8, !tbaa !3
+  %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092
+  store i8 0, i8* %arrayidx10, align 1, !tbaa !1
+  %13 = load i64** @TOP, align 8, !tbaa !3
+  %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092
+  %14 = load i64* %arrayidx11, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %14)
+  %15 = load i64** @BOT, align 8, !tbaa !3
+  %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092
+  %16 = load i64* %arrayidx12, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %16)
+  br label %for.inc
+
+if.else:                                          ; preds = %land.lhs.true7.if.else_crit_edge, %if.then
+  %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ]
+  %cmp13 = icmp ult i64 %i.092, %17
+  %or.cond89 = and i1 %cmp13, %cmp6
+  br i1 %or.cond89, label %land.lhs.true16, label %if.else24
+
+land.lhs.true16:                                  ; preds = %if.else
+  %18 = load i64* @channelTracks, align 8, !tbaa !0
+  %add17 = add i64 %18, 1
+  %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1)
+  %tobool19 = icmp eq i32 %call18, 0
+  br i1 %tobool19, label %if.else24, label %if.then20
+
+if.then20:                                        ; preds = %land.lhs.true16
+  %19 = load i8** @mazeRoute, align 8, !tbaa !3
+  %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092
+  store i8 0, i8* %arrayidx21, align 1, !tbaa !1
+  %20 = load i64** @TOP, align 8, !tbaa !3
+  %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092
+  %21 = load i64* %arrayidx22, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %21)
+  %22 = load i64** @BOT, align 8, !tbaa !3
+  %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092
+  %23 = load i64* %arrayidx23, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %23)
+  br label %for.inc
+
+if.else24:                                        ; preds = %land.lhs.true16, %if.else
+  br i1 %cmp5, label %land.lhs.true26, label %if.else36
+
+land.lhs.true26:                                  ; preds = %if.else24
+  %24 = load i64* @channelTracks, align 8, !tbaa !0
+  %cmp27 = icmp ult i64 %7, %24
+  br i1 %cmp27, label %land.lhs.true28, label %if.else36
+
+land.lhs.true28:                                  ; preds = %land.lhs.true26
+  %add29 = add i64 %24, 1
+  %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1)
+  %tobool31 = icmp eq i32 %call30, 0
+  br i1 %tobool31, label %if.else36, label %if.then32
+
+if.then32:                                        ; preds = %land.lhs.true28
+  %25 = load i8** @mazeRoute, align 8, !tbaa !3
+  %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092
+  store i8 0, i8* %arrayidx33, align 1, !tbaa !1
+  %26 = load i64** @TOP, align 8, !tbaa !3
+  %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092
+  %27 = load i64* %arrayidx34, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %27)
+  %28 = load i64** @BOT, align 8, !tbaa !3
+  %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092
+  %29 = load i64* %arrayidx35, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %29)
+  br label %for.inc
+
+if.else36:                                        ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24
+  %30 = load i64* @channelColumns, align 8, !tbaa !0
+  %cmp37 = icmp ult i64 %i.092, %30
+  br i1 %cmp37, label %land.lhs.true38, label %if.else48
+
+land.lhs.true38:                                  ; preds = %if.else36
+  %31 = load i64* @channelTracks, align 8, !tbaa !0
+  %cmp39 = icmp ult i64 %7, %31
+  br i1 %cmp39, label %land.lhs.true40, label %if.else48
+
+land.lhs.true40:                                  ; preds = %land.lhs.true38
+  %add41 = add i64 %31, 1
+  %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1)
+  %tobool43 = icmp eq i32 %call42, 0
+  br i1 %tobool43, label %if.else48, label %if.then44
+
+if.then44:                                        ; preds = %land.lhs.true40
+  %32 = load i8** @mazeRoute, align 8, !tbaa !3
+  %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092
+  store i8 0, i8* %arrayidx45, align 1, !tbaa !1
+  %33 = load i64** @TOP, align 8, !tbaa !3
+  %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092
+  %34 = load i64* %arrayidx46, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %34)
+  %35 = load i64** @BOT, align 8, !tbaa !3
+  %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092
+  %36 = load i64* %arrayidx47, align 8, !tbaa !0
+  tail call fastcc void @CleanNet(i64 %36)
+  br label %for.inc
+
+if.else48:                                        ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36
+  %inc = add nsw i32 %numLeft.091, 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body
+  %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ]
+  %inc53 = add i64 %i.092, 1
+  %37 = load i64* @channelColumns, align 8, !tbaa !0
+  %cmp = icmp ugt i64 %inc53, %37
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+  ret i32 %numLeft.0.lcssa
+}
+
+; Materializable
+declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp
+
+; Materializable
+declare hidden fastcc void @CleanNet(i64) nounwind ssp
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}

Added: llvm/trunk/test/CodeGen/ARM64/ccmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ccmp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ccmp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ccmp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,289 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; CHECK: single_same
+; CHECK: cmp w0, #5
+; CHECK-NEXT: ccmp w1, #17, #4, ne
+; CHECK-NEXT: b.ne
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 5
+  %cmp1 = icmp eq i32 %b, 17
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Different condition codes for the two compares.
+; CHECK: single_different
+; CHECK: cmp w0, #6
+; CHECK-NEXT: ccmp w1, #17, #0, ge
+; CHECK-NEXT: b.eq
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp sle i32 %a, 5
+  %cmp1 = icmp ne i32 %b, 17
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Second block clobbers the flags, can't convert (easily).
+; CHECK: single_flagclobber
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: b.gt
+define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 5
+  br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp1 = icmp slt i32 %b, 7
+  %mul = shl nsw i32 %b, 1
+  %add = add nsw i32 %b, 1
+  %cond = select i1 %cmp1, i32 %mul, i32 %add
+  %cmp2 = icmp slt i32 %cond, 17
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then:                                          ; preds = %lor.lhs.false, %entry
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %lor.lhs.false
+  ret i32 7
+}
+
+; Second block clobbers the flags and ends with a tbz terminator.
+; CHECK: single_flagclobber_tbz
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: tbz
+define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 5
+  br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp1 = icmp slt i32 %b, 7
+  %mul = shl nsw i32 %b, 1
+  %add = add nsw i32 %b, 1
+  %cond = select i1 %cmp1, i32 %mul, i32 %add
+  %and = and i32 %cond, 8
+  %cmp2 = icmp ne i32 %and, 0
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then:                                          ; preds = %lor.lhs.false, %entry
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %lor.lhs.false
+  ret i32 7
+}
+
+; Speculatively execute division by zero.
+; The sdiv/udiv instructions do not trap when the divisor is zero, so they are
+; safe to speculate.
+; CHECK: speculate_division
+; CHECK-NOT: cmp
+; CHECK: sdiv
+; CHECK: cmp
+; CHECK-NEXT: ccmp
+define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp sgt i32 %a, 0
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+  %div = sdiv i32 %b, %a
+  %cmp1 = icmp slt i32 %div, 17
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Floating point compare.
+; CHECK: single_fcmp
+; CHECK: cmp
+; CHECK-NOT: b.
+; CHECK: fccmp {{.*}}, #8, ge
+; CHECK: b.lt
+define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
+entry:
+  %cmp = icmp sgt i32 %a, 0
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+  %conv = sitofp i32 %a to float
+  %div = fdiv float %b, %conv
+  %cmp1 = fcmp oge float %div, 1.700000e+01
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Chain multiple compares.
+; CHECK: multi_different
+; CHECK: cmp
+; CHECK: ccmp
+; CHECK: ccmp
+; CHECK: b.
+define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp {
+entry:
+  %cmp = icmp sgt i32 %a, %b
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+  %div = sdiv i32 %b, %a
+  %cmp1 = icmp eq i32 %div, 5
+  %cmp4 = icmp sgt i32 %div, %c
+  %or.cond = and i1 %cmp1, %cmp4
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Convert a cbz in the head block.
+; CHECK: cbz_head
+; CHECK: cmp w0, #0
+; CHECK: ccmp
+define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 0
+  %cmp1 = icmp ne i32 %b, 17
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Check that the immediate operand is in range. The ccmp instruction encodes a
+; smaller range of immediates than subs/adds.
+; The ccmp immediates must be in the range 0-31.
+; CHECK: immediate_range
+; CHECK-NOT: ccmp
+define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 5
+  %cmp1 = icmp eq i32 %b, 32
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Convert a cbz in the second block.
+; CHECK: cbz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #0, ne
+; CHECK: b.eq
+define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 0
+  %cmp1 = icmp ne i32 %b, 0
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+
+; Convert a cbnz in the second block.
+; CHECK: cbnz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #4, ne
+; CHECK: b.ne
+define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %cmp = icmp eq i32 %a, 0
+  %cmp1 = icmp eq i32 %b, 0
+  %or.cond = or i1 %cmp, %cmp1
+  br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+  %call = tail call i32 @foo() nounwind
+  br label %if.end
+
+if.end:
+  ret i32 7
+}
+declare i32 @foo()
+
+%str1 = type { %str2 }
+%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* }
+
+; Test case distilled from 126.gcc.
+; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor.
+; CHECK: build_modify_expr
+define void @build_modify_expr() nounwind ssp {
+entry:
+  switch i32 undef, label %sw.bb.i.i [
+    i32 69, label %if.end85
+    i32 70, label %if.end85
+    i32 71, label %if.end85
+    i32 72, label %if.end85
+    i32 73, label %if.end85
+    i32 105, label %if.end85
+    i32 106, label %if.end85
+  ]
+
+if.end85:
+  ret void
+
+sw.bb.i.i:
+  %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ]
+  %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2
+  %arrayidx.i.i = bitcast i32* %operands.i.i to %str1**
+  %0 = load %str1** %arrayidx.i.i, align 8
+  %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16
+  br label %sw.bb.i.i
+}

Added: llvm/trunk/test/CodeGen/ARM64/coalesce-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/coalesce-ext.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/coalesce-ext.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/coalesce-ext.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; Check that the peephole optimizer knows about sext and zext instructions.
+; CHECK: test1sext
+define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
+  %C = add i64 %A, %B
+  ; CHECK: add x[[SUM:[0-9]+]], x0, x1
+  %D = trunc i64 %C to i32
+  %E = shl i64 %C, 32
+  %F = ashr i64 %E, 32
+  ; CHECK: sxtw x[[EXT:[0-9]+]], x[[SUM]]
+  store volatile i64 %F, i64 *%P2
+  ; CHECK: str x[[EXT]]
+  store volatile i32 %D, i32* %P
+  ; Reuse low bits of extended register, don't extend live range of SUM.
+  ; CHECK: str w[[SUM]]
+  ret i32 %D
+}

Added: llvm/trunk/test/CodeGen/ARM64/code-model-large-abs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/code-model-large-abs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/code-model-large-abs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/code-model-large-abs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s
+
+ at var8 = global i8 0
+ at var16 = global i16 0
+ at var32 = global i32 0
+ at var64 = global i64 0
+
+define i8* @global_addr() {
+; CHECK-LABEL: global_addr:
+  ret i8* @var8
+  ; The movz/movk calculation should end up returned directly in x0.
+; CHECK: movz x0, #:abs_g3:var8
+; CHECK: movk x0, #:abs_g2_nc:var8
+; CHECK: movk x0, #:abs_g1_nc:var8
+; CHECK: movk x0, #:abs_g0_nc:var8
+; CHECK-NEXT: ret
+}
+
+define i8 @global_i8() {
+; CHECK-LABEL: global_i8:
+  %val = load i8* @var8
+  ret i8 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8
+; CHECK: ldrb w0, [x[[ADDR_REG]]]
+}
+
+define i16 @global_i16() {
+; CHECK-LABEL: global_i16:
+  %val = load i16* @var16
+  ret i16 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16
+; CHECK: ldrh w0, [x[[ADDR_REG]]]
+}
+
+define i32 @global_i32() {
+; CHECK-LABEL: global_i32:
+  %val = load i32* @var32
+  ret i32 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32
+; CHECK: ldr w0, [x[[ADDR_REG]]]
+}
+
+define i64 @global_i64() {
+; CHECK-LABEL: global_i64:
+  %val = load i64* @var64
+  ret i64 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64
+; CHECK: ldr x0, [x[[ADDR_REG]]]
+}
+
+define <2 x i64> @constpool() {
+; CHECK-LABEL: constpool:
+  ret <2 x i64> <i64 123456789, i64 987654321100>
+
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]]
+; CHECK: ldr q0, [x[[ADDR_REG]]]
+}

Added: llvm/trunk/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/collect-loh-garbage-crash.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/collect-loh-garbage-crash.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/collect-loh-garbage-crash.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; Check that the LOH analysis does not crash when the analyzed chain
+; contains instructions that are filtered out.
+;
+; Before the fix for <rdar://problem/16041712>, these cases were removed
+; from the main container. Now, the deterministic container does not allow
+; to remove arbitrary values, so we have to live with garbage values.
+; <rdar://problem/16041712>
+
+%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* }
+
+%"class.H4ISP::H4ISPCameraManager" = type opaque
+
+declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*)
+
+ at pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8
+
+; CHECK-LABEL: _foo:
+; CHECK: ret
+; CHECK-NOT: .loh AdrpLdrGotLdr
+define void @foo() {
+entry:
+  br label %if.then83
+if.then83:                                        ; preds = %if.end81
+  %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"()
+  %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"()
+  %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3
+  %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8
+  %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null
+  br i1 %tobool.i269, label %if.then83, label %end
+end:
+  ret void
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/collect-loh-str.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/collect-loh-str.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/collect-loh-str.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/collect-loh-str.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; Test case for <rdar://problem/15942912>.
+; AdrpAddStr cannot be used when the store uses the same
+; register as the address and the value. Indeed, the related optimization,
+; if applied, may completely remove the definition or
+; at least provide a wrong one (with the offset folded
+; into the definition).
+
+%struct.anon = type { i32*, i32** }
+
+ at pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
+
+; CHECK-LABEL: _pptp_wan_init
+; CHECK: ret
+; CHECK-NOT: AdrpAddStr
+define i32 @pptp_wan_init() {
+entry:
+  store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8
+  store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8
+  ret i32 0
+}
+
+

Added: llvm/trunk/test/CodeGen/ARM64/collect-loh.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/collect-loh.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/collect-loh.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/collect-loh.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,47 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+
+ at a = internal unnamed_addr global i32 0, align 4
+ at b = external global i32
+
+; Function Attrs: noinline nounwind ssp
+define void @foo(i32 %t) {
+entry:
+  %tmp = load i32* @a, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* @a, align 4
+  ret void
+}
+
+; Function Attrs: nounwind ssp
+; Testcase for <rdar://problem/15438605>, AdrpAdrp reuse is valid only when the first adrp
+; dominates the second.
+; The first adrp comes from the loading of 'a' and the second the loading of 'b'.
+; 'a' is loaded in if.then, 'b' in if.end4, if.then does not dominate if.end4.
+; CHECK-LABEL: _test
+; CHECK: ret
+; CHECK-NOT: .loh AdrpAdrp
+define i32 @test(i32 %t) {
+entry:
+  %cmp = icmp sgt i32 %t, 5
+  br i1 %cmp, label %if.then, label %if.end4
+
+if.then:                                          ; preds = %entry
+  %tmp = load i32* @a, align 4
+  %add = add nsw i32 %tmp, %t
+  %cmp1 = icmp sgt i32 %add, 12
+  br i1 %cmp1, label %if.then2, label %if.end4
+
+if.then2:                                         ; preds = %if.then
+  tail call void @foo(i32 %add)
+  %tmp1 = load i32* @a, align 4
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.then2, %if.then, %entry
+  %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ]
+  %tmp2 = load i32* @b, align 4
+  %add5 = add nsw i32 %tmp2, %t.addr.0
+  tail call void @foo(i32 %add5)
+  %tmp3 = load i32* @b, align 4
+  %add6 = add nsw i32 %tmp3, %t.addr.0
+  ret i32 %add6
+}

Added: llvm/trunk/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S (added)
+++ llvm/trunk/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s
+
+        .text
+        .globl _foo
+        .cfi_startproc
+_foo:
+        stp x29, x30, [sp, #-16]!
+ .cfi_adjust_cfa_offset 16
+
+        ldp x29, x30, [sp], #16
+ .cfi_adjust_cfa_offset -16
+        .cfi_restore x29
+        .cfi_restore x30
+
+        ret
+
+        .cfi_endproc

Added: llvm/trunk/test/CodeGen/ARM64/complex-ret.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/complex-ret.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/complex-ret.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/complex-ret.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,7 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+define { i192, i192, i21, i192 } @foo(i192) {
+; CHECK-LABEL: foo:
+; CHECK: stp xzr, xzr, [x8]
+  ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3}
+}

Added: llvm/trunk/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/convert-v2f64-v2i32.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/convert-v2f64-v2i32.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/convert-v2f64-v2i32.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; CHECK: fptosi_1
+; CHECK: fcvtzs.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptosi_1() nounwind noinline ssp {
+entry:
+  %0 = fptosi <2 x double> undef to <2 x i32>
+  store <2 x i32> %0, <2 x i32>* undef, align 8
+  ret void
+}
+
+; CHECK: fptoui_1
+; CHECK: fcvtzu.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptoui_1() nounwind noinline ssp {
+entry:
+  %0 = fptoui <2 x double> undef to <2 x i32>
+  store <2 x i32> %0, <2 x i32>* undef, align 8
+  ret void
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/convert-v2i32-v2f64.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/convert-v2i32-v2f64.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/convert-v2i32-v2f64.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f1:
+; CHECK: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
+  %conv = sitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %conv
+}
+define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f2:
+; CHECK: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
+  %conv = uitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %conv
+}
+
+; CHECK: autogen_SD19655
+; CHECK: scvtf
+; CHECK: ret
+define void @autogen_SD19655() {
+  %T = load <2 x i64>* undef
+  %F = sitofp <2 x i64> undef to <2 x float>
+  store <2 x float> %F, <2 x float>* undef
+  ret void
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/copy-tuple.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/copy-tuple.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/copy-tuple.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/copy-tuple.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,146 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
+
+; The main purpose of this test is to find out whether copyPhysReg can deal with
+; the memmove-like situation arising in tuples, where an early copy can clobber
+; the value needed by a later one if the tuples overlap.
+
+; We use dummy inline asm to force LLVM to generate a COPY between the registers
+; we want by clobbering all the others.
+
+define void @test_D1D2_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D1D2_from_D0D1:
+; CHECK: orr.8b v2, v1
+; CHECK: orr.8b v1, v0
+entry:
+  %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+  %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+  tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  ret void
+}
+
+define void @test_D0D1_from_D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D1D2:
+; CHECK: orr.8b v0, v1
+; CHECK: orr.8b v1, v2
+entry:
+  %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+  %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+  tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  ret void
+}
+
+define void @test_D0D1_from_D31D0(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D31D0:
+; CHECK: orr.8b v1, v0
+; CHECK: orr.8b v0, v31
+entry:
+  %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+  %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+  tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  ret void
+}
+
+define void @test_D31D0_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D31D0_from_D0D1:
+; CHECK: orr.8b v31, v0
+; CHECK: orr.8b v0, v1
+entry:
+  %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+  %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+  tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  ret void
+}
+
+define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D2D3D4_from_D0D1D2:
+; CHECK: orr.8b v4, v2
+; CHECK: orr.8b v3, v1
+; CHECK: orr.8b v2, v0
+entry:
+  %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+  %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0
+  %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1
+  %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2
+
+  tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+  ret void
+}
+
+define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 {
+; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3:
+; CHECK: orr.16b v0, v1
+; CHECK: orr.16b v1, v2
+; CHECK: orr.16b v2, v3
+entry:
+  %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+  %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+  %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+  %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+  tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+  ret void
+}
+
+define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 {
+; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1:
+; CHECK: orr.16b v4, v1
+; CHECK: orr.16b v3, v0
+; CHECK: orr.16b v2, v31
+; CHECK: orr.16b v1, v30
+  %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+  %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+  %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+  %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+  %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3
+
+  tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"()
+  tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+
+  tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+  tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+  ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
+
+declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)

Added: llvm/trunk/test/CodeGen/ARM64/crc32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/crc32.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/crc32.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/crc32.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,71 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+define i32 @test_crc32b(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32b:
+; CHECK: crc32b w0, w0, w1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32h(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32h:
+; CHECK: crc32h w0, w0, w1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32w(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32w:
+; CHECK: crc32w w0, w0, w1
+  %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32x(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32x:
+; CHECK: crc32x w0, w0, x1
+  %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32cb(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32cb:
+; CHECK: crc32cb w0, w0, w1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32ch(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32ch:
+; CHECK: crc32ch w0, w0, w1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32cw(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32cw:
+; CHECK: crc32cw w0, w0, w1
+  %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32cx(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32cx:
+; CHECK: crc32cx w0, w0, x1
+  %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next)
+  ret i32 %val
+}
+
+declare i32 @llvm.arm64.crc32b(i32, i32)
+declare i32 @llvm.arm64.crc32h(i32, i32)
+declare i32 @llvm.arm64.crc32w(i32, i32)
+declare i32 @llvm.arm64.crc32x(i32, i64)
+
+declare i32 @llvm.arm64.crc32cb(i32, i32)
+declare i32 @llvm.arm64.crc32ch(i32, i32)
+declare i32 @llvm.arm64.crc32cw(i32, i32)
+declare i32 @llvm.arm64.crc32cx(i32, i64)

Added: llvm/trunk/test/CodeGen/ARM64/crypto.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/crypto.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/crypto.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/crypto.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,135 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
+declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+
+define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aese:
+; CHECK: aese.16b v0, v1
+  %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aesd:
+; CHECK: aesd.16b v0, v1
+  %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesmc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesmc:
+; CHECK: aesmc.16b v0, v0
+ %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesimc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesimc:
+; CHECK: aesimc.16b v0, v0
+ %res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+  ret <16 x i8> %res
+}
+
+declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
+declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+
+define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1
+define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c_in_a_row:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
+; CHECK-NOT: fmov
+; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  %extract = extractelement <4 x i32> %res, i32 0
+  %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
+  ret <4 x i32> %res2
+}
+
+define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1p:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1p.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1m:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1m.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define i32 @test_sha1h(i32 %hash_e) {
+; CHECK-LABEL: test_sha1h:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
+; CHECK: fmov w0, [[RES]]
+  %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
+  ret i32 %res
+}
+
+define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
+; CHECK-LABEL: test_sha1su0:
+; CHECK: sha1su0.4s v0, v1, v2
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
+; CHECK-LABEL: test_sha1su1:
+; CHECK: sha1su1.4s v0, v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+
+define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h:
+; CHECK: sha256h.4s q0, q1, v2
+  %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h2:
+; CHECK: sha256h2.4s q0, q1, v2
+
+  %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK-LABEL: test_sha256su0:
+; CHECK: sha256su0.4s v0, v1
+  %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK-LABEL: test_sha256su1:
+; CHECK: sha256su1.4s v0, v1, v2
+  %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+  ret <4 x i32> %res
+}

Added: llvm/trunk/test/CodeGen/ARM64/cse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/cse.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/cse.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/cse.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,59 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; rdar://12462006
+; CSE between "icmp reg reg" and "sub reg reg".
+; Both can be in the same basic block or in different basic blocks.
+define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.ge
+; CHECK: sub
+; CHECK: sub
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, %size
+ %s = sub nsw i32 %0, %size
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, %size
+ %s2 = sub nsw i32 %s, %size
+ %s3 = sub nsw i32 %sub, %s2
+ store i32 %s3, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
+
+; CSE between "icmp reg imm" and "sub reg imm".
+define i8* @t2(i8* %base, i32* nocapture %offset) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.lt
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, 1
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, 1
+ store i32 %sub, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}

Added: llvm/trunk/test/CodeGen/ARM64/csel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/csel.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/csel.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/csel.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,222 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-unknown-unknown"
+
+; CHECK: foo1
+; CHECK: csinc w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK:                                     w[[REG]], eq
+define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %not.tobool = icmp ne i32 %c, 0
+  %add = zext i1 %not.tobool to i32
+  %b.add = add i32 %c, %b
+  %add1 = add i32 %b.add, %add
+  ret i32 %add1
+}
+
+; CHECK: foo2
+; CHECK: csneg w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK:                                     w[[REG]], eq
+define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %mul = sub i32 0, %b
+  %tobool = icmp eq i32 %c, 0
+  %b.mul = select i1 %tobool, i32 %b, i32 %mul
+  %add = add nsw i32 %b.mul, %c
+  ret i32 %add
+}
+
+; CHECK: foo3
+; CHECK: csinv w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK:                                     w[[REG]], eq
+define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %not.tobool = icmp ne i32 %c, 0
+  %xor = sext i1 %not.tobool to i32
+  %b.xor = xor i32 %xor, %b
+  %add = add nsw i32 %b.xor, %c
+  ret i32 %add
+}
+
+; rdar://11632325
+define i32 @foo4(i32 %a) nounwind ssp {
+; CHECK: foo4
+; CHECK: csneg
+; CHECK-NEXT: ret
+  %cmp = icmp sgt i32 %a, -1
+  %neg = sub nsw i32 0, %a
+  %cond = select i1 %cmp, i32 %a, i32 %neg
+  ret i32 %cond
+}
+
+define i32 @foo5(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: foo5
+; CHECK: subs
+; CHECK-NEXT: csneg
+; CHECK-NEXT: ret
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub3 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub3
+  ret i32 %cond
+}
+
+; make sure we can handle branch instruction in optimizeCompare.
+define i32 @foo6(i32 %a, i32 %b) nounwind ssp {
+; CHECK: foo6
+; CHECK: b
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, 0
+  br i1 %cmp, label %l.if, label %l.else
+
+l.if:
+  ret i32 1
+
+l.else:
+  ret i32 %sub
+}
+
+; If CPSR is used multiple times and V flag is used, we don't remove cmp.
+define i32 @foo7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: foo7:
+; CHECK: sub
+; CHECK-NEXT: adds
+; CHECK-NEXT: csneg
+; CHECK-NEXT: b
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub3 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub3
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %cmp2 = icmp slt i32 %sub, -1
+  %sel = select i1 %cmp2, i32 %cond, i32 %a
+  ret i32 %sel
+
+if.else:
+  ret i32 %cond
+}
+
+define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo8:
+; CHECK: cmp w0, #0
+; CHECK: csinv w0, w1, w2, ne
+  %tobool = icmp eq i32 %v, 0
+  %neg = xor i32 -1, %b
+  %cond = select i1 %tobool, i32 %neg, i32 %a
+  ret i32 %cond
+}
+
+define i32 @foo9(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo9:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: csinv w0, w[[REG]], w[[REG]], ne
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 4, i32 -5
+  ret i32 %cond
+}
+
+define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo10:
+; CHECK: cmp x0, #0
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
+; CHECK: csinv x0, x[[REG]], x[[REG]], ne
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 4, i64 -5
+  ret i64 %cond
+}
+
+define i32 @foo11(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo11:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: csneg w0, w[[REG]], w[[REG]], ne
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 4, i32 -4
+  ret i32 %cond
+}
+
+define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo12:
+; CHECK: cmp x0, #0
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
+; CHECK: csneg x0, x[[REG]], x[[REG]], ne
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 4, i64 -4
+  ret i64 %cond
+}
+
+define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo13:
+; CHECK: cmp w0, #0
+; CHECK: csneg w0, w1, w2, ne
+  %tobool = icmp eq i32 %v, 0
+  %sub = sub i32 0, %b
+  %cond = select i1 %tobool, i32 %sub, i32 %a
+  ret i32 %cond
+}
+
+define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo14:
+; CHECK: cmp x0, #0
+; CHECK: csneg x0, x1, x2, ne
+  %tobool = icmp eq i64 %v, 0
+  %sub = sub i64 0, %b
+  %cond = select i1 %tobool, i64 %sub, i64 %a
+  ret i64 %cond
+}
+
+define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo15:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: csinc w0, w[[REG]], w[[REG]], le
+  %cmp = icmp sgt i32 %a, %b
+  %. = select i1 %cmp, i32 2, i32 1
+  ret i32 %.
+}
+
+define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo16:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: csinc w0, w[[REG]], w[[REG]], gt
+  %cmp = icmp sgt i32 %a, %b
+  %. = select i1 %cmp, i32 1, i32 2
+  ret i32 %.
+}
+
+define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo17:
+; CHECK: cmp x0, x1
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
+; CHECK: csinc x0, x[[REG]], x[[REG]], le
+  %cmp = icmp sgt i64 %a, %b
+  %. = select i1 %cmp, i64 2, i64 1
+  ret i64 %.
+}
+
+define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo18:
+; CHECK: cmp x0, x1
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
+; CHECK: csinc x0, x[[REG]], x[[REG]], gt
+  %cmp = icmp sgt i64 %a, %b
+  %. = select i1 %cmp, i64 1, i64 2
+  ret i64 %.
+}

Added: llvm/trunk/test/CodeGen/ARM64/cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/cvt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/cvt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/cvt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,401 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to away)
+;
+define i32 @fcvtas_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1s:
+;CHECK: fcvtas w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1s:
+;CHECK: fcvtas x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtas_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1d:
+;CHECK: fcvtas w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1d:
+;CHECK: fcvtas x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer
+;
+define i32 @fcvtau_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1s:
+;CHECK: fcvtau w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1s:
+;CHECK: fcvtau x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtau_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1d:
+;CHECK: fcvtau w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1d:
+;CHECK: fcvtau x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward -Inf)
+;
+define i32 @fcvtms_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1s:
+;CHECK: fcvtms w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1s:
+;CHECK: fcvtms x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtms_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1d:
+;CHECK: fcvtms w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1d:
+;CHECK: fcvtms x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward -Inf)
+;
+define i32 @fcvtmu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1s:
+;CHECK: fcvtmu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1s:
+;CHECK: fcvtmu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtmu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1d:
+;CHECK: fcvtmu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1d:
+;CHECK: fcvtmu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to even)
+;
+define i32 @fcvtns_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1s:
+;CHECK: fcvtns w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1s:
+;CHECK: fcvtns x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtns_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1d:
+;CHECK: fcvtns w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1d:
+;CHECK: fcvtns x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
+;
+define i32 @fcvtnu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1s:
+;CHECK: fcvtnu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1s:
+;CHECK: fcvtnu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtnu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1d:
+;CHECK: fcvtnu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1d:
+;CHECK: fcvtnu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward +Inf)
+;
+define i32 @fcvtps_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1s:
+;CHECK: fcvtps w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1s:
+;CHECK: fcvtps x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtps_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1d:
+;CHECK: fcvtps w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1d:
+;CHECK: fcvtps x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward +Inf)
+;
+define i32 @fcvtpu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1s:
+;CHECK: fcvtpu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1s:
+;CHECK: fcvtpu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtpu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1d:
+;CHECK: fcvtpu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1d:
+;CHECK: fcvtpu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone
+
+;
+;  Floating-point scalar convert to signed integer (toward zero)
+;
+define i32 @fcvtzs_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1s:
+;CHECK: fcvtzs w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1s:
+;CHECK: fcvtzs x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtzs_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1d:
+;CHECK: fcvtzs w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1d:
+;CHECK: fcvtzs x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward zero)
+;
+define i32 @fcvtzu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1s:
+;CHECK: fcvtzu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1s:
+;CHECK: fcvtzu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtzu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1d:
+;CHECK: fcvtzu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1d:
+;CHECK: fcvtzu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/dagcombiner-convergence.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/dagcombiner-convergence.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/dagcombiner-convergence.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/dagcombiner-convergence.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,19 @@
+; RUN: llc < %s -o /dev/null
+; rdar://10795250
+; DAGCombiner should converge.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.8.0"
+
+define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) {
+entry:
+  %tmp = lshr i128 %Params.coerce, 61
+  %.tr38.i = trunc i128 %tmp to i64
+  %mul.i = and i64 %.tr38.i, 4294967288
+  %tmp1 = lshr i128 %SelLocs.coerce, 62
+  %.tr.i = trunc i128 %tmp1 to i64
+  %mul7.i = and i64 %.tr.i, 4294967292
+  %add.i = add i64 %mul7.i, %mul.i
+  %conv.i.i = and i64 %add.i, 4294967292
+  ret i64 %conv.i.i
+}

Added: llvm/trunk/test/CodeGen/ARM64/dagcombiner-load-slicing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/dagcombiner-load-slicing.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/dagcombiner-load-slicing.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/dagcombiner-load-slicing.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s
+; <rdar://problem/14477220>
+
+%class.Complex = type { float, float }
+%class.Complex_int = type { i32, i32 }
+%class.Complex_long = type { i64, i64 }
+
+; CHECK-LABEL: @test
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test(%class.Complex* nocapture %out, i64 %out_start) {
+entry:
+  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %0 = bitcast %class.Complex* %arrayidx to i64*
+  %1 = load i64* %0, align 4
+  %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+  %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
+  %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+  %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
+  %add = add i64 %out_start, 8
+  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
+  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
+  %4 = load float* %i.i, align 4
+  %add.i = fadd float %4, %2
+  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
+  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
+  %5 = load float* %r.i, align 4
+  %add5.i = fadd float %5, %3
+  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
+  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
+  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
+  ret void
+}
+
+; CHECK-LABEL: @test_int
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) {
+entry:
+  %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start
+  %0 = bitcast %class.Complex_int* %arrayidx to i64*
+  %1 = load i64* %0, align 4
+  %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+  %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32
+  %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+  %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32
+  %add = add i64 %out_start, 8
+  %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add
+  %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0
+  %4 = load i32* %i.i, align 4
+  %add.i = add i32 %4, %2
+  %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0
+  %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1
+  %5 = load i32* %r.i, align 4
+  %add5.i = add i32 %5, %3
+  %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1
+  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>*
+  store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4
+  ret void
+}
+
+; CHECK-LABEL: @test_long
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4
+; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) {
+entry:
+  %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start
+  %0 = bitcast %class.Complex_long* %arrayidx to i128*
+  %1 = load i128* %0, align 4
+  %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64
+  %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64
+  %t0.sroa.2.0.extract.shift = lshr i128 %1, 64
+  %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64
+  %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64
+  %add = add i64 %out_start, 8
+  %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add
+  %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0
+  %4 = load i64* %i.i, align 4
+  %add.i = add i64 %4, %2
+  %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0
+  %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1
+  %5 = load i64* %r.i, align 4
+  %add5.i = add i64 %5, %3
+  %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1
+  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>*
+  store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/dup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/dup.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/dup.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/dup.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,322 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @v_dup8(i8 %A) nounwind {
+;CHECK-LABEL: v_dup8:
+;CHECK: dup.8b
+	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
+	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
+	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
+	%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
+	%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
+	%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
+	%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
+	%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
+	ret <8 x i8> %tmp8
+}
+
+define <4 x i16> @v_dup16(i16 %A) nounwind {
+;CHECK-LABEL: v_dup16:
+;CHECK: dup.4h
+	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
+	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
+	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
+	%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
+	ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_dup32(i32 %A) nounwind {
+;CHECK-LABEL: v_dup32:
+;CHECK: dup.2s
+	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
+	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
+	ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_dupfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupfloat:
+;CHECK: dup.2s
+	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
+	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
+	ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_dupQ8:
+;CHECK: dup.16b
+	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
+	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
+	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
+	%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
+	%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
+	%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
+	%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
+	%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
+	%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
+	%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
+	%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
+	%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
+	%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
+	%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
+	%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
+	%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
+	ret <16 x i8> %tmp16
+}
+
+define <8 x i16> @v_dupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_dupQ16:
+;CHECK: dup.8h
+	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
+	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
+	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
+	%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
+	%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
+	%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
+	%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
+	%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
+	ret <8 x i16> %tmp8
+}
+
+define <4 x i32> @v_dupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_dupQ32:
+;CHECK: dup.4s
+	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
+	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
+	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
+	%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
+	ret <4 x i32> %tmp4
+}
+
+define <4 x float> @v_dupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupQfloat:
+;CHECK: dup.4s
+	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
+	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
+	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
+	%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
+	ret <4 x float> %tmp4
+}
+
+; Check to make sure it works with shuffles, too.
+
+define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledup8:
+;CHECK: dup.8b
+	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledup16:
+;CHECK: dup.4h
+	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+	ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledup32:
+;CHECK: dup.2s
+	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
+	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+	ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_shuffledupfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupfloat:
+;CHECK: dup.2s
+	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
+	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+	ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ8:
+;CHECK: dup.16b
+	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ16:
+;CHECK: dup.8h
+	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
+	ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ32:
+;CHECK: dup.4s
+	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
+	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
+	ret <4 x i32> %tmp2
+}
+
+define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupQfloat:
+;CHECK: dup.4s
+	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
+	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+	ret <4 x float> %tmp2
+}
+
+define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplane8:
+;CHECK: dup.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplane16:
+;CHECK: dup.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+	ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplane32:
+;CHECK: dup.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+	ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplanefloat:
+;CHECK: dup.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
+	ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ8:
+;CHECK: dup.16b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ16:
+;CHECK: dup.8h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+	ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ32:
+;CHECK: dup.4s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+	ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplaneQfloat:
+;CHECK: dup.4s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+	ret <4 x float> %tmp2
+}
+
+define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: foo:
+;CHECK: dup.2d
+entry:
+  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: bar:
+;CHECK: dup.2d
+entry:
+  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x i64> %0
+}
+
+define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: baz:
+;CHECK: dup.2d
+entry:
+  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x double> %0
+}
+
+define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: qux:
+;CHECK: dup.2d
+entry:
+  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %0
+}
+
+define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone  {
+; CHECK-LABEL: f:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ret
+  %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
+  %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
+  ret <2 x i32> %vecinit1
+}
+
+define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone  {
+; CHECK-LABEL: g:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+  %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
+  ret <4 x i32> %vecinit3
+}
+
+define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone  {
+; CHECK-LABEL: h:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ins.d v0[1], x1
+; CHECK-NEXT: ret
+  %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
+  ret <2 x i64> %vecinit1
+}
+
+; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that
+; the single value needed was of the same type as the vector. This is false if
+; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
+; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
+; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
+define <4 x i16> @test_build_illegal(<4 x i32> %in) {
+; CHECK-LABEL: test_build_illegal:
+; CHECK: umov.s [[WTMP:w[0-9]+]], v0[3]
+; CHECK: dup.4h v0, [[WTMP]]
+  %val = extractelement <4 x i32> %in, i32 3
+  %smallval = trunc i32 %val to i16
+  %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
+
+  ret <4 x i16> %vec
+}
+
+; We used to inherit an already extract_subvectored v4i16 from
+; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
+; the formation of an indexed-by-7 MLS.
+define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_high_splat:
+; CHECK: mls.4h v0, v1, v2[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %b
+  %sub = sub <4 x i16> %a, %mul
+  ret <4 x i16> %sub
+}

Added: llvm/trunk/test/CodeGen/ARM64/early-ifcvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/early-ifcvt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/early-ifcvt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/early-ifcvt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,423 @@
+; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+target triple = "arm64-apple-macosx"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+  %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+  %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+  %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+  %0 = load i32* %p.addr.0, align 4
+  %cmp = icmp sgt i32 %0, %max.0
+  br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+  %cmp1 = icmp slt i32 %0, %min.0
+  %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+  br label %do.cond
+
+do.cond:
+  %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+  %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: cbnz
+  %dec = add i32 %n.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+  %sub = sub nsw i32 %max.1, %min.1
+  ret i32 %sub
+}
+
+; CHECK-LABEL: fold_inc_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inc = add nsw i32 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inc = add nsw i64 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inc = add nsw i32 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inc = add nsw i64 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inv = xor i32 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inv = xor i64 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inv = xor i32 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inv = xor i64 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %neg = sub nsw i32 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %neg = sub nsw i64 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %neg = sub nsw i32 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %neg = sub nsw i64 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: cbnz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: cbnz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: cbz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp ne i32 %c, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: cbz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp ne i64 %c, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: tbnz_32
+; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %mask = and i32 %c, 128
+  %tobool = icmp eq i32 %mask, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: tbnz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %mask = and i64 %c, 9223372036854775808
+  %tobool = icmp eq i64 %mask, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: tbz_32
+; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %mask = and i32 %c, 128
+  %tobool = icmp ne i32 %mask, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: tbz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %mask = and i64 %c, 9223372036854775808
+  %tobool = icmp ne i64 %mask, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; This function from 175.vpr folds an ADDWri into a CSINC.
+; Remember to clear the kill flag on the ADDWri.
+define i32 @get_ytrack_to_xtracks() nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:
+  %x0 = load i32* undef, align 4
+  br i1 undef, label %if.then.i146, label %is_sbox.exit155
+
+if.then.i146:
+  %add8.i143 = add nsw i32 0, %x0
+  %rem.i144 = srem i32 %add8.i143, %x0
+  %add9.i145 = add i32 %rem.i144, 1
+  br label %is_sbox.exit155
+
+is_sbox.exit155:                                  ; preds = %if.then.i146, %for.body
+  %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ]
+  %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64
+  %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152
+  %x1 = load i32* %arrayidx18.i154, align 4
+  br i1 undef, label %for.body51, label %for.body
+
+for.body51:                                       ; preds = %is_sbox.exit155
+  call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef)
+  unreachable
+}
+declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp

Added: llvm/trunk/test/CodeGen/ARM64/elf-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/elf-calls.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/elf-calls.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/elf-calls.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ
+
+declare void @callee()
+
+define void @caller() {
+  call void @callee()
+  ret void
+; CHECK-LABEL: caller:
+; CHECK:     bl callee
+; CHECK-OBJ: R_AARCH64_CALL26 callee
+}
+
+define void @tail_caller() {
+  tail call void @callee()
+  ret void
+; CHECK-LABEL: tail_caller:
+; CHECK:     b callee
+; CHECK-OBJ: R_AARCH64_JUMP26 callee
+}

Added: llvm/trunk/test/CodeGen/ARM64/elf-constpool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/elf-constpool.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/elf-constpool.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/elf-constpool.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s
+
+; O0 checked for fastisel purposes. It has a separate path which
+; creates a constpool entry for floating values.
+
+define double @needs_const() {
+  ret double 3.14159
+; CHECK: .LCPI0_0:
+
+; CHECK: adrp {{x[0-9]+}}, .LCPI0_0
+; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0]
+}

Added: llvm/trunk/test/CodeGen/ARM64/elf-globals.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/elf-globals.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/elf-globals.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/elf-globals.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-FAST-PIC
+
+@var8 = external global i8, align 1
+@var16 = external global i16, align 2
+@var32 = external global i32, align 4
+@var64 = external global i64, align 8
+
+define i8 @test_i8(i8 %new) {
+  %val = load i8* @var8, align 1
+  store i8 %new, i8* @var8
+  ret i8 %val
+; CHECK-LABEL: test_i8:
+; CHECK: adrp x[[HIREG:[0-9]+]], var8
+; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-PIC-LABEL: test_i8:
+; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8
+; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]]
+}
+
+define i16 @test_i16(i16 %new) {
+  %val = load i16* @var16, align 2
+  store i16 %new, i16* @var16
+  ret i16 %val
+; CHECK-LABEL: test_i16:
+; CHECK: adrp x[[HIREG:[0-9]+]], var16
+; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16
+; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+}
+
+define i32 @test_i32(i32 %new) {
+  %val = load i32* @var32, align 4
+  store i32 %new, i32* @var32
+  ret i32 %val
+; CHECK-LABEL: test_i32:
+; CHECK: adrp x[[HIREG:[0-9]+]], var32
+; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32
+}
+
+define i64 @test_i64(i64 %new) {
+  %val = load i64* @var64, align 8
+  store i64 %new, i64* @var64
+  ret i64 %val
+; CHECK-LABEL: test_i64:
+; CHECK: adrp x[[HIREG:[0-9]+]], var64
+; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64
+}
+
+define i64* @test_addr() {
+  ret i64* @var64
+; CHECK-LABEL: test_addr:
+; CHECK: adrp [[HIREG:x[0-9]+]], var64
+; CHECK: add x0, [[HIREG]], :lo12:var64
+
+; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64
+; CHECK-FAST: add x0, [[HIREG]], :lo12:var64
+}
+
+@hiddenvar = hidden global i32 0, align 4
+@protectedvar = protected global i32 0, align 4
+
+define i32 @test_vis() {
+  %lhs = load i32* @hiddenvar, align 4
+  %rhs = load i32* @protectedvar, align 4
+  %ret = add i32 %lhs, %rhs
+  ret i32 %ret
+; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar]
+; CHECK-PIC: adrp {{x[0-9]+}}, protectedvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar]
+}
+
+@var_default = external global [2 x i32]
+
+define i32 @test_default_align() {
+  %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0
+  %val = load i32* %addr
+  ret i32 %val
+; CHECK-LABEL: test_default_align:
+; CHECK: adrp x[[HIREG:[0-9]+]], var_default
+; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default]
+}
+
+define i64 @test_default_unaligned() {
+  %addr = bitcast [2 x i32]* @var_default to i64*
+  %val = load i64* %addr
+  ret i64 %val
+; CHECK-LABEL: test_default_unaligned:
+; CHECK: adrp [[HIREG:x[0-9]+]], var_default
+; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default
+; CHECK: ldr x0, [x[[ADDR]]]
+}

Added: llvm/trunk/test/CodeGen/ARM64/ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ext.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ext.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ext.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd:
+;CHECK: {{ext.8b.*#3}}
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+	ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRd:
+;CHECK: {{ext.8b.*#5}}
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextq:
+;CHECK: {{ext.16b.*3}}
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+	ret <16 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq:
+;CHECK: {{ext.16b.*7}}
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: test_vextd16:
+;CHECK: {{ext.8b.*#6}}
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+	ret <4 x i16> %tmp3
+}
+
+define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: test_vextq32:
+;CHECK: {{ext.16b.*12}}
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+	ret <4 x i32> %tmp3
+}
+
+; Undef shuffle indices should not prevent matching to VEXT:
+
+define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef:
+;CHECK: {{ext.8b.*}}
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10>
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq_undef:
+;CHECK: {{ext.16b.*#7}}
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 undef, i32 undef, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 undef, i32 6>
+	ret <16 x i8> %tmp3
+}
+
+; Tests for ReconstructShuffle function. Indices have to be carefully
+; chosen to reach lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector
+; Also checks interleaving of sources is handled correctly.
+; Essence: a vext is used on %A and something saner than stack load/store for final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 5, i32 9>
+        ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
+        ret <4 x i16> %tmp3
+}

Added: llvm/trunk/test/CodeGen/ARM64/extend-int-to-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/extend-int-to-fp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/extend-int-to-fp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/extend-int-to-fp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s	v0, v0, #0
+; CHECK-NEXT: ucvtf.4s	v0, v0
+; CHECK-NEXT: ret
+  %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s	v0, v0, #0
+; CHECK-NEXT: scvtf.4s	v0, v0
+; CHECK-NEXT: ret
+  %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}

Added: llvm/trunk/test/CodeGen/ARM64/extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/extend.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/extend.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/extend.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK:  adrp  x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK:  ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK:  ldrsw x0, [x[[REG1]], x0, sxtw #2]
+; CHECK:  ret
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+  %tmp1 = load i32* %arrayidx, align 4
+  %conv = sext i32 %tmp1 to i64
+  ret i64 %conv
+}

Added: llvm/trunk/test/CodeGen/ARM64/extload-knownzero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/extload-knownzero.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/extload-knownzero.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/extload-knownzero.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+  %tmp1 = icmp ult i32 %a, 100
+  br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+  %tmp2 = load i16* %ptr, align 2
+  br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+  %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+  %cmp = icmp ult i16 %tmp3, 24
+  br i1 %cmp, label %bb3, label %exit
+bb3:
+  call void @bar() nounwind
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @bar ()

Added: llvm/trunk/test/CodeGen/ARM64/extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/extract.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/extract.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/extract.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,58 @@
+; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+    %left = shl i64 %in, 19
+    %right = lshr i64 %in, 45
+    %val5 = or i64 %left, %right
+; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+    ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+    %left = shl i32 %in, 9
+    %right = lshr i32 %in, 23
+    %val5 = or i32 %left, %right
+; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+    ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+  %left = shl i32 %lhs, 6
+  %right = lshr i32 %rhs, 26
+  %val = or i32 %left, %right
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+  ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+  %right = lshr i64 %rhs, 40
+  %left = shl i64 %lhs, 24
+  %val = or i64 %right, %left
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than x0 and x1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+  ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+    %sh1 = shl i32 %a, 14
+    %sh2 = lshr i32 %b, 14
+    %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+    ret i32 %val
+; CHECK: ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/extract_subvector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/extract_subvector.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/extract_subvector.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/extract_subvector.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>  <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>  <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>  <i32 2, i32 3>
+  ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>  <i32 1>
+  ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>  <i32 2, i32 3>
+  ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32>  <i32 1>
+  ret <1 x double> %ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-addr-offset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-addr-offset.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-addr-offset.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-addr-offset.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than LDR imm can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #20000
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+  %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+  ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #40000
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+  %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+  ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #2874, lsl #32
+; CHECK: movk x[[REG]], #29646, lsl #16
+; CHECK: movk x[[REG]], #12274
+  %0 = load i8** @pd2, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-alloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-alloca.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-alloca.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-alloca.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,24 @@
+; This test should cause the TargetMaterializeAlloca to be invoked
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+  %V.addr = alloca %struct.S1Ty*, align 8
+  store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+  ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+  %E = alloca %struct.S2Ty, align 4
+  %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+  call void @takeS1(%struct.S1Ty* %B)
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-br.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-br.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-br.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-br.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+  %x = alloca i32, align 4
+  store i32 0, i32* %x, align 4
+  %1 = load i32* %x, align 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %3, label %4
+
+; <label>:3                                       ; preds = %0
+  br label %4
+
+; <label>:4                                       ; preds = %3, %0
+  ret void
+}
+
+define void @branch2() nounwind uwtable ssp {
+  %1 = alloca i32, align 4
+  %x = alloca i32, align 4
+  %y = alloca i32, align 4
+  %z = alloca i32, align 4
+  store i32 0, i32* %1
+  store i32 1, i32* %y, align 4
+  store i32 1, i32* %x, align 4
+  store i32 0, i32* %z, align 4
+  %2 = load i32* %x, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %5
+
+; <label>:4                                       ; preds = %0
+  store i32 0, i32* %1
+  br label %14
+
+; <label>:5                                       ; preds = %0
+  %6 = load i32* %y, align 4
+  %7 = icmp ne i32 %6, 0
+  br i1 %7, label %8, label %13
+
+; <label>:8                                       ; preds = %5
+  %9 = load i32* %z, align 4
+  %10 = icmp ne i32 %9, 0
+  br i1 %10, label %11, label %12
+
+; <label>:11                                      ; preds = %8
+  store i32 1, i32* %1
+  br label %14
+
+; <label>:12                                      ; preds = %8
+  store i32 0, i32* %1
+  br label %14
+
+; <label>:13                                      ; preds = %5
+  br label %14
+
+; <label>:14                                      ; preds = %4, %11, %12, %13
+  %15 = load i32* %1
+  ret void
+}
+
+define void @true_() nounwind uwtable ssp {
+; CHECK: @true_
+; CHECK: b LBB2_1
+  br i1 true, label %1, label %2
+
+; <label>:1
+; CHECK: LBB2_1
+  br label %2
+
+; <label>:2
+  ret void
+}
+
+define void @false_() nounwind uwtable ssp {
+; CHECK: @false_
+; CHECK: b LBB3_2
+  br i1 false, label %1, label %2
+
+; <label>:1
+  br label %2
+
+; <label>:2
+; CHECK: LBB3_2
+  ret void
+}
+
+define zeroext i8 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) {
+entry:
+  %a.addr = alloca i8, align 1
+  %b.addr = alloca i16, align 2
+  %c.addr = alloca i32, align 4
+  %d.addr = alloca i64, align 8
+  store i8 %a, i8* %a.addr, align 1
+  store i16 %b, i16* %b.addr, align 2
+  store i32 %c, i32* %c.addr, align 4
+  store i64 %d, i64* %d.addr, align 8
+  %0 = load i16* %b.addr, align 2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: b.eq LBB4_2
+  %conv = trunc i16 %0 to i1
+  br i1 %conv, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo1()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %1 = load i32* %c.addr, align 4
+; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
+; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: b.eq LBB4_4
+  %conv1 = trunc i32 %1 to i1
+  br i1 %conv1, label %if.then3, label %if.end4
+
+if.then3:                                         ; preds = %if.end
+  call void @foo1()
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.then3, %if.end
+  %2 = load i64* %d.addr, align 8
+; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: b.eq LBB4_6
+  %conv5 = trunc i64 %2 to i1
+  br i1 %conv5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.end4
+  call void @foo1()
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.then7, %if.end4
+  %3 = load i8* %a.addr, align 1
+  ret i8 %3
+}
+
+declare void @foo1()
+
+; rdar://15174028
+define i32 @trunc64(i64 %foo) nounwind {
+; CHECK: trunc64
+; CHECK: orr  [[REG:x[0-9]+]], xzr, #0x1
+; CHECK: and  [[REG2:x[0-9]+]], x0, [[REG]]
+; CHECK: mov  x[[REG3:[0-9]+]], [[REG2]]
+; CHECK: and  [[REG4:w[0-9]+]], w[[REG3]], #0x1
+; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: b.eq LBB5_2
+  %a = and i64 %foo, 1
+  %b = trunc i64 %a to i1
+  br i1 %b, label %if.then, label %if.else
+
+if.then:
+  ret i32 1
+
+if.else:
+  ret i32 0
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-call.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-call.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-call.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,91 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @call0() nounwind {
+entry:
+  ret void
+}
+
+define void @foo0() nounwind {
+entry:
+; CHECK: foo0
+; CHECK: bl _call0
+  call void @call0()
+  ret void
+}
+
+define i32 @call1(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  ret i32 %tmp
+}
+
+define i32 @foo1(i32 %a) nounwind {
+entry:
+; CHECK: foo1
+; CHECK: stur w0, [fp, #-4]
+; CHECK-NEXT: ldur w0, [fp, #-4]
+; CHECK-NEXT: bl _call1
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  %call = call i32 @call1(i32 %tmp)
+  ret i32 %call
+}
+
+define i32 @sext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @sext_
+; CHECK: sxtb w0, w0
+; CHECK: sxth w1, w1
+; CHECK: bl _foo_sext_
+  call void @foo_sext_(i8 signext %a, i16 signext %b)
+  ret i32 0
+}
+
+declare void @foo_sext_(i8 %a, i16 %b)
+
+define i32 @zext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @zext_
+; CHECK: uxtb w0, w0
+; CHECK: uxth w1, w1
+  call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
+  ret i32 0
+}
+
+declare void @foo_zext_(i8 %a, i16 %b)
+
+define i32 @t1(i32 %argc, i8** nocapture %argv) {
+entry:
+; CHECK: @t1
+; The last parameter will be passed on stack via i8.
+; CHECK: strb w{{[0-9]+}}, [sp]
+; CHECK-NEXT: bl _bar
+  %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
+  ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+; Test materialization of integers.  Target-independent selector handles this.
+define i32 @t2() {
+entry:
+; CHECK: @t2
+; CHECK: movz x0, #0
+; CHECK: orr w1, wzr, #0xfffffff8
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
+; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
+; CHECK: movz w[[REG3:[0-9]+]], #0
+; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
+; CHECK: uxth w2, w[[REG]]
+; CHECK: sxtb w3, w[[REG2]]
+; CHECK: and w4, w[[REG3]], #0x1
+; CHECK: and w5, w[[REG4]], #0x1
+; CHECK: bl	_func2
+  %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
+  ret i32 0
+}
+
+declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext, i1 zeroext)

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-conversion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-conversion.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-conversion.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-conversion.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,416 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+;; Test various conversions.
+define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: trunc_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldr x3, [sp]
+; CHECK: mov x0, x3
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: add sp, sp, #16
+; CHECK: ret
+  %a.addr = alloca i8, align 1
+  %b.addr = alloca i16, align 2
+  %c.addr = alloca i32, align 4
+  %d.addr = alloca i64, align 8
+  store i8 %a, i8* %a.addr, align 1
+  store i16 %b, i16* %b.addr, align 2
+  store i32 %c, i32* %c.addr, align 4
+  store i64 %d, i64* %d.addr, align 8
+  %tmp = load i64* %d.addr, align 8
+  %conv = trunc i64 %tmp to i32
+  store i32 %conv, i32* %c.addr, align 4
+  %tmp1 = load i32* %c.addr, align 4
+  %conv2 = trunc i32 %tmp1 to i16
+  store i16 %conv2, i16* %b.addr, align 2
+  %tmp3 = load i16* %b.addr, align 2
+  %conv4 = trunc i16 %tmp3 to i8
+  store i8 %conv4, i8* %a.addr, align 1
+  %tmp5 = load i8* %a.addr, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  ret i32 %conv6
+}
+
+define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: zext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: uxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: uxtw x3, w0
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp], #16
+; CHECK: ret
+  %a.addr = alloca i8, align 1
+  %b.addr = alloca i16, align 2
+  %c.addr = alloca i32, align 4
+  %d.addr = alloca i64, align 8
+  store i8 %a, i8* %a.addr, align 1
+  store i16 %b, i16* %b.addr, align 2
+  store i32 %c, i32* %c.addr, align 4
+  store i64 %d, i64* %d.addr, align 8
+  %tmp = load i8* %a.addr, align 1
+  %conv = zext i8 %tmp to i16
+  store i16 %conv, i16* %b.addr, align 2
+  %tmp1 = load i16* %b.addr, align 2
+  %conv2 = zext i16 %tmp1 to i32
+  store i32 %conv2, i32* %c.addr, align 4
+  %tmp3 = load i32* %c.addr, align 4
+  %conv4 = zext i32 %tmp3 to i64
+  store i64 %conv4, i64* %d.addr, align 8
+  %tmp5 = load i64* %d.addr, align 8
+  ret i64 %tmp5
+}
+
+define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i32
+; CHECK: and w0, w0, #0x1
+  %conv = zext i1 %a to i32
+  ret i32 %conv;
+}
+
+define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i64
+; CHECK: and w0, w0, #0x1
+  %conv = zext i1 %a to i64
+  ret i64 %conv;
+}
+
+define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: sext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: sxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: sxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: sxtw x3, w0
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp], #16
+; CHECK: ret
+  %a.addr = alloca i8, align 1
+  %b.addr = alloca i16, align 2
+  %c.addr = alloca i32, align 4
+  %d.addr = alloca i64, align 8
+  store i8 %a, i8* %a.addr, align 1
+  store i16 %b, i16* %b.addr, align 2
+  store i32 %c, i32* %c.addr, align 4
+  store i64 %d, i64* %d.addr, align 8
+  %tmp = load i8* %a.addr, align 1
+  %conv = sext i8 %tmp to i16
+  store i16 %conv, i16* %b.addr, align 2
+  %tmp1 = load i16* %b.addr, align 2
+  %conv2 = sext i16 %tmp1 to i32
+  store i32 %conv2, i32* %c.addr, align 4
+  %tmp3 = load i32* %c.addr, align 4
+  %conv4 = sext i32 %tmp3 to i64
+  store i64 %conv4, i64* %d.addr, align 8
+  %tmp5 = load i64* %d.addr, align 8
+  ret i64 %tmp5
+}
+
+; Test sext i8 to i64
+define i64 @sext_2(i8 signext %a) nounwind ssp {
+entry:
+; CHECK: sext_2
+; CHECK: sxtb x0, w0
+  %conv = sext i8 %a to i64
+  ret i64 %conv
+}
+
+; Test sext i1 to i32
+define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i32
+; CHECK: sbfm w0, w0, #0, #0
+  %conv = sext i1 %a to i32
+  ret i32 %conv
+}
+
+; Test sext i1 to i16
+define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i16
+; CHECK: sbfm w0, w0, #0, #0
+  %conv = sext i1 %a to i16
+  ret i16 %conv
+}
+
+; Test sext i1 to i8
+define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i8
+; CHECK: sbfm w0, w0, #0, #0
+  %conv = sext i1 %a to i8
+  ret i8 %conv
+}
+
+; Test fpext
+define double @fpext_(float %a) nounwind ssp {
+entry:
+; CHECK: fpext_
+; CHECK: fcvt d0, s0
+  %conv = fpext float %a to double
+  ret double %conv
+}
+
+; Test fptrunc
+define float @fptrunc_(double %a) nounwind ssp {
+entry:
+; CHECK: fptrunc_
+; CHECK: fcvt s0, d0
+  %conv = fptrunc double %a to float
+  ret float %conv
+}
+
+; Test fptosi
+define i32 @fptosi_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptosi_ws
+; CHECK: fcvtzs w0, s0
+  %conv = fptosi float %a to i32
+  ret i32 %conv
+}
+
+; Test fptosi
+define i32 @fptosi_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptosi_wd
+; CHECK: fcvtzs w0, d0
+  %conv = fptosi double %a to i32
+  ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptoui_ws
+; CHECK: fcvtzu w0, s0
+  %conv = fptoui float %a to i32
+  ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptoui_wd
+; CHECK: fcvtzu w0, d0
+  %conv = fptoui double %a to i32
+  ret i32 %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i1
+; CHECK: sbfm w0, w0, #0, #0
+; CHECK: scvtf s0, w0
+  %conv = sitofp i1 %a to float
+  ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i8
+; CHECK: sxtb w0, w0
+; CHECK: scvtf s0, w0
+  %conv = sitofp i8 %a to float
+  ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i16
+; CHECK: sxth w0, w0
+; CHECK: scvtf s0, w0
+  %conv = sitofp i16 %a to float
+  ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw
+; CHECK: scvtf s0, w0
+  %conv = sitofp i32 %a to float
+  ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sx
+; CHECK: scvtf s0, x0
+  %conv = sitofp i64 %a to float
+  ret float %conv
+}
+
+; Test sitofp
+define double @sitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dw
+; CHECK: scvtf d0, w0
+  %conv = sitofp i32 %a to double
+  ret double %conv
+}
+
+; Test sitofp
+define double @sitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dx
+; CHECK: scvtf d0, x0
+  %conv = sitofp i64 %a to double
+  ret double %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i1
+; CHECK: and w0, w0, #0x1
+; CHECK: ucvtf s0, w0
+  %conv = uitofp i1 %a to float
+  ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i8
+; CHECK: uxtb w0, w0
+; CHECK: ucvtf s0, w0
+  %conv = uitofp i8 %a to float
+  ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i16
+; CHECK: uxth w0, w0
+; CHECK: ucvtf s0, w0
+  %conv = uitofp i16 %a to float
+  ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw
+; CHECK: ucvtf s0, w0
+  %conv = uitofp i32 %a to float
+  ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sx
+; CHECK: ucvtf s0, x0
+  %conv = uitofp i64 %a to float
+  ret float %conv
+}
+
+; Test uitofp
+define double @uitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dw
+; CHECK: ucvtf d0, w0
+  %conv = uitofp i32 %a to double
+  ret double %conv
+}
+
+; Test uitofp
+define double @uitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dx
+; CHECK: ucvtf d0, x0
+  %conv = uitofp i64 %a to double
+  ret double %conv
+}
+
+define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i32
+; CHECK: mov x1, x0
+  %conv = trunc i64 %a to i32
+  ret i32 %conv
+}
+
+define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i16
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
+; CHECK: uxth w0, [[REG2]]
+  %conv = trunc i64 %a to i16
+  ret i16 %conv
+}
+
+define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i8
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
+; CHECK: uxtb w0, [[REG2]]
+  %conv = trunc i64 %a to i8
+  ret i8 %conv
+}
+
+define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i1
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
+; CHECK: and w0, [[REG2]], #0x1
+  %conv = trunc i64 %a to i1
+  ret i1 %conv
+}
+
+; rdar://15101939
+define void @stack_trunc() nounwind {
+; CHECK: stack_trunc
+; CHECK: sub  sp, sp, #16
+; CHECK: ldr  [[REG:x[0-9]+]], [sp]
+; CHECK: mov  x[[REG2:[0-9]+]], [[REG]]
+; CHECK: and  [[REG3:w[0-9]+]], w[[REG2]], #0xff
+; CHECK: strb [[REG3]], [sp, #15]
+; CHECK: add  sp, sp, #16
+  %a = alloca i8, align 1
+  %b = alloca i64, align 8
+  %c = load i64* %b, align 8
+  %d = trunc i64 %c to i8
+  store i8 %d, i8* %a, align 1
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-fcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-fcmp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-fcmp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-fcmp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,146 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une float %a, 0.000000e+00
+  ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une float %a, %b
+  ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une double %a, 0.000000e+00
+  ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une double %a, %b
+  ret i1 %cmp
+}
+
+; Check each fcmp condition
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+  %cmp = fcmp oeq float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+  %cmp = fcmp ogt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+  %cmp = fcmp oge float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+  %cmp = fcmp olt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+  %cmp = fcmp ole float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vs
+  %cmp = fcmp ord float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vc
+  %cmp = fcmp uno float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ls
+  %cmp = fcmp ugt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, mi
+  %cmp = fcmp uge float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ge
+  %cmp = fcmp ult float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, gt
+  %cmp = fcmp ule float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-gv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-gv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-gv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-gv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,38 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Test load/store of global value from global offset table.
+ at seed = common global i64 0, align 8
+
+define void @Initrand() nounwind {
+entry:
+; CHECK: @Initrand
+; CHECK: adrp x[[REG:[0-9]+]], _seed at GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed at GOTPAGEOFF]
+; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+  store i64 74755, i64* @seed, align 8
+  ret void
+}
+
+define i32 @Rand() nounwind {
+entry:
+; CHECK: @Rand
+; CHECK: adrp x[[REG:[0-9]+]], _seed at GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed at GOTPAGEOFF]
+; CHECK: movz x[[REG3:[0-9]+]], #1309
+; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
+; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
+; CHECK: movz x[[REG6:[0-9]+]], #13849
+; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
+; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
+; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
+; CHECK: str x[[REG9]], [x[[REG]]]
+; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+  %0 = load i64* @seed, align 8
+  %mul = mul nsw i64 %0, 1309
+  %add = add nsw i64 %mul, 13849
+  %and = and i64 %add, 65535
+  store i64 %and, i64* @seed, align 8
+  %1 = load i64* @seed, align 8
+  %conv = trunc i64 %1 to i32
+  ret i32 %conv
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-icmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-icmp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-icmp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-icmp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,214 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_imm
+; CHECK: cmp  w0, #31
+; CHECK: csinc w0, wzr, wzr, ne
+  %cmp = icmp eq i32 %a, 31
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_neg_imm
+; CHECK: cmn  w0, #7
+; CHECK: csinc w0, wzr, wzr, ne
+  %cmp = icmp eq i32 %a, -7
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+  %cmp = icmp eq i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ne
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, eq
+  %cmp = icmp ne i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ugt
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, ls
+  %cmp = icmp ugt i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_uge
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, cc
+  %cmp = icmp uge i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ult
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, cs
+  %cmp = icmp ult i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ule
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, hi
+  %cmp = icmp ule i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sgt
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, le
+  %cmp = icmp sgt i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sge
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, lt
+  %cmp = icmp sge i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_slt
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, ge
+  %cmp = icmp slt i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sle
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, gt
+  %cmp = icmp sle i32 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
+entry:
+; CHECK: icmp_i64
+; CHECK: cmp  x0, x1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+  %cmp = icmp sle i64 %a, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i16
+; CHECK: sxth w0, w0
+; CHECK: sxth w1, w1
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+  %cmp = icmp eq i16 %a, %b
+  ret i1 %cmp
+}
+
+define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i8
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+  %cmp = icmp eq i8 %a, %b
+  ret i1 %cmp
+}
+
+define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
+entry:
+; CHECK: icmp_i16_unsigned
+; CHECK: uxth w0, w0
+; CHECK: uxth w1, w1
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, cs
+  %cmp = icmp ult i16 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
+entry:
+; CHECK: @icmp_i8_signed
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp  w0, w1
+; CHECK: csinc w0, wzr, wzr, le
+  %cmp = icmp sgt i8 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+
+define i32 @icmp_i16_signed_const(i16 %a) nounwind {
+entry:
+; CHECK: icmp_i16_signed_const
+; CHECK: sxth w0, w0
+; CHECK: cmn  w0, #233
+; CHECK: csinc w0, wzr, wzr, ge
+; CHECK: and w0, w0, #0x1
+  %cmp = icmp slt i16 %a, -233
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed_const(i8 %a) nounwind {
+entry:
+; CHECK: icmp_i8_signed_const
+; CHECK: sxtb w0, w0
+; CHECK: cmp  w0, #124
+; CHECK: csinc w0, wzr, wzr, le
+; CHECK: and w0, w0, #0x1
+  %cmp = icmp sgt i8 %a, 124
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
+
+define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
+entry:
+; CHECK: icmp_i1_unsigned_const
+; CHECK: and w0, w0, #0x1
+; CHECK: cmp  w0, #0
+; CHECK: csinc w0, wzr, wzr, cs
+; CHECK: and w0, w0, #0x1
+  %cmp = icmp ult i1 %a, 0
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-indirectbr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-indirectbr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-indirectbr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-indirectbr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,36 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+ at fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
+
+define i32 @fn(i32 %target) nounwind {
+entry:
+; CHECK: @fn
+  %retval = alloca i32, align 4
+  %target.addr = alloca i32, align 4
+  store i32 %target, i32* %target.addr, align 4
+  %0 = load i32* %target.addr, align 4
+  %idxprom = zext i32 %0 to i64
+  %arrayidx = getelementptr inbounds [2 x i8*]* @fn.table, i32 0, i64 %idxprom
+  %1 = load i8** %arrayidx, align 8
+  br label %indirectgoto
+
+ZERO:                                             ; preds = %indirectgoto
+; CHECK: LBB0_1
+  store i32 0, i32* %retval
+  br label %return
+
+ONE:                                              ; preds = %indirectgoto
+; CHECK: LBB0_2
+  store i32 1, i32* %retval
+  br label %return
+
+return:                                           ; preds = %ONE, %ZERO
+  %2 = load i32* %retval
+  ret i32 %2
+
+indirectgoto:                                     ; preds = %entry
+; CHECK: ldr x0, [sp]
+; CHECK: br x0
+  %indirect.goto.dest = phi i8* [ %1, %entry ]
+  indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-intrinsic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-intrinsic.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-intrinsic.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-intrinsic.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,135 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+
+@message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
+@temp = common global [80 x i8] zeroinitializer, align 16
+
+define void @t1() {
+; ARM64: t1
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x0, x8, _message@PAGEOFF
+; ARM64: movz w9, #0
+; ARM64: movz x2, #80
+; ARM64: uxtb w1, w9
+; ARM64: bl _memset
+  call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i32 16, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define void @t2() {
+; ARM64: t2
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #80
+; ARM64: bl _memcpy
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 80, i32 16, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t3() {
+; ARM64: t3
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #20
+; ARM64: bl _memmove
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 20, i32 16, i1 false)
+  ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t4() {
+; ARM64: t4
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 16, i1 false)
+  ret void
+}
+
+define void @t5() {
+; ARM64: t5
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 8, i1 false)
+  ret void
+}
+
+define void @t6() {
+; ARM64: t6
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr w10, [x9]
+; ARM64: str w10, [x8]
+; ARM64: ldr w10, [x9, #4]
+; ARM64: str w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #8]
+; ARM64: strb w10, [x8, #8]
+; ARM64: ret
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 9, i32 4, i1 false)
+  ret void
+}
+
+define void @t7() {
+; ARM64: t7
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrh w10, [x9]
+; ARM64: strh w10, [x8]
+; ARM64: ldrh w10, [x9, #2]
+; ARM64: strh w10, [x8, #2]
+; ARM64: ldrh w10, [x9, #4]
+; ARM64: strh w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #6]
+; ARM64: strb w10, [x8, #6]
+; ARM64: ret
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 7, i32 2, i1 false)
+  ret void
+}
+
+define void @t8() {
+; ARM64: t8
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrb w10, [x9]
+; ARM64: strb w10, [x8]
+; ARM64: ldrb w10, [x9, #1]
+; ARM64: strb w10, [x8, #1]
+; ARM64: ldrb w10, [x9, #2]
+; ARM64: strb w10, [x8, #2]
+; ARM64: ldrb w10, [x9, #3]
+; ARM64: strb w10, [x8, #3]
+; ARM64: ret
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false)
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-materialize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-materialize.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-materialize.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-materialize.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,27 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Materialize using fmov
+define void @float_(float* %value) {
+; CHECK: @float_
+; CHECK: fmov s0, #1.250000e+00
+  store float 1.250000e+00, float* %value, align 4
+  ret void
+}
+
+define void @double_(double* %value) {
+; CHECK: @double_
+; CHECK: fmov d0, #1.250000e+00
+  store double 1.250000e+00, double* %value, align 8
+  ret void
+}
+
+; Materialize from constant pool
+define float @float_cp() {
+; CHECK: @float_cp
+  ret float 0x400921FB60000000
+}
+
+define double @double_cp() {
+; CHECK: @double_cp
+  ret double 0x400921FB54442D18
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-noconvert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-noconvert.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-noconvert.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-noconvert.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+
+; Fast-isel can't do vector conversions yet, but it was emitting some highly
+; suspect UCVTFUWDri MachineInstrs.
+define <4 x float> @test_uitofp(<4 x i32> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK: ucvtf.4s v0, v0
+
+  %res = uitofp <4 x i32> %in to <4 x float>
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_sitofp(<2 x i32> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK: sshll.2d [[EXT:v[0-9]+]], v0, #0
+; CHECK: scvtf.2d v0, [[EXT]]
+
+  %res = sitofp <2 x i32> %in to <2 x double>
+  ret <2 x double> %res
+}
+
+define <2 x i32> @test_fptoui(<2 x float> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK: fcvtzu.2s v0, v0
+
+  %res = fptoui <2 x float> %in to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <2 x i64> @test_fptosi(<2 x double> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK: fcvtzs.2d v0, v0
+
+  %res = fptosi <2 x double> %in to <2 x i64>
+  ret <2 x i64> %res
+}
\ No newline at end of file

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-rem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-rem.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-rem.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-rem.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,33 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b) {
+; CHECK: @t1
+; CHECK: sdiv w2, w0, w1
+; CHECK: msub w2, w2, w1, w0
+  %1 = srem i32 %a, %b
+  ret i32 %1
+}
+
+define i64 @t2(i64 %a, i64 %b) {
+; CHECK: @t2
+; CHECK: sdiv x2, x0, x1
+; CHECK: msub x2, x2, x1, x0
+  %1 = srem i64 %a, %b
+  ret i64 %1
+}
+
+define i32 @t3(i32 %a, i32 %b) {
+; CHECK: @t3
+; CHECK: udiv w2, w0, w1
+; CHECK: msub w2, w2, w1, w0
+  %1 = urem i32 %a, %b
+  ret i32 %1
+}
+
+define i64 @t4(i64 %a, i64 %b) {
+; CHECK: @t4
+; CHECK: udiv x2, x0, x1
+; CHECK: msub x2, x2, x1, x0
+  %1 = urem i64 %a, %b
+  ret i64 %1
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-ret.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-ret.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-ret.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-ret.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+;; Test returns.
+define void @t0() nounwind ssp {
+entry:
+; CHECK: t0
+; CHECK: ret
+  ret void
+}
+
+define i32 @t1(i32 %a) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: str w0, [sp, #12]
+; CHECK-NEXT: ldr w0, [sp, #12]
+; CHECK: ret
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  ret i32 %tmp
+}
+
+define i64 @t2(i64 %a) nounwind ssp {
+entry:
+; CHECK: t2
+; CHECK: str x0, [sp, #8]
+; CHECK-NEXT: ldr x0, [sp, #8]
+; CHECK: ret
+  %a.addr = alloca i64, align 8
+  store i64 %a, i64* %a.addr, align 8
+  %tmp = load i64* %a.addr, align 8
+  ret i64 %tmp
+}
+
+define signext i16 @ret_i16(i16 signext %a) nounwind {
+entry:
+; CHECK: @ret_i16
+; CHECK: sxth	w0, w0
+  %a.addr = alloca i16, align 1
+  store i16 %a, i16* %a.addr, align 1
+  %0 = load i16* %a.addr, align 1
+  ret i16 %0
+}
+
+define signext i8 @ret_i8(i8 signext %a) nounwind {
+entry:
+; CHECK: @ret_i8
+; CHECK: sxtb	w0, w0
+  %a.addr = alloca i8, align 1
+  store i8 %a, i8* %a.addr, align 1
+  %0 = load i8* %a.addr, align 1
+  ret i8 %0
+}
+
+define signext i1 @ret_i1(i1 signext %a) nounwind {
+entry:
+; CHECK: @ret_i1
+; CHECK: and w0, w0, #0x1
+  %a.addr = alloca i1, align 1
+  store i1 %a, i1* %a.addr, align 1
+  %0 = load i1* %a.addr, align 1
+  ret i1 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel-select.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel-select.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel-select.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @t1(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t1
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+  %0 = icmp sgt i32 %c, 1
+  %1 = select i1 %0, i32 123, i32 357
+  ret i32 %1
+}
+
+define i64 @t2(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+  %0 = icmp sgt i32 %c, 1
+  %1 = select i1 %0, i64 123, i64 357
+  ret i64 %1
+}
+
+define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: @t3
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+  %0 = select i1 %c, i32 %a, i32 %b
+  ret i32 %0
+}
+
+define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
+entry:
+; CHECK: @t4
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+  %0 = select i1 %c, i64 %a, i64 %b
+  ret i64 %0
+}
+
+define float @t5(i1 %c, float %a, float %b) nounwind readnone {
+entry:
+; CHECK: @t5
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel s0, s0, s1, ne
+  %0 = select i1 %c, float %a, float %b
+  ret float %0
+}
+
+define double @t6(i1 %c, double %a, double %b) nounwind readnone {
+entry:
+; CHECK: @t6
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel d0, d0, d1, ne
+  %0 = select i1 %c, double %a, double %b
+  ret double %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fast-isel.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fast-isel.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,95 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @t0(i32 %a) nounwind {
+entry:
+; CHECK: t0
+; CHECK: str {{w[0-9]+}}, [sp, #12]
+; CHECK-NEXT: ldr [[REGISTER:w[0-9]+]], [sp, #12]
+; CHECK-NEXT: str [[REGISTER]], [sp, #12]
+; CHECK: ret
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr
+  %tmp = load i32* %a.addr
+  store i32 %tmp, i32* %a.addr
+  ret void
+}
+
+define void @t1(i64 %a) nounwind {
+; CHECK: t1
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK-NEXT: ldr [[REGISTER:x[0-9]+]], [sp, #8]
+; CHECK-NEXT: str [[REGISTER]], [sp, #8]
+; CHECK: ret
+  %a.addr = alloca i64, align 4
+  store i64 %a, i64* %a.addr
+  %tmp = load i64* %a.addr
+  store i64 %tmp, i64* %a.addr
+  ret void
+}
+
+define zeroext i1 @i1(i1 %a) nounwind {
+entry:
+; CHECK: @i1
+; CHECK: and w0, w0, #0x1
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: and w0, w0, #0x1
+; CHECK: and w0, w0, #0x1
+; CHECK: add sp, sp, #16
+; CHECK: ret
+  %a.addr = alloca i1, align 1
+  store i1 %a, i1* %a.addr, align 1
+  %0 = load i1* %a.addr, align 1
+  ret i1 %0
+}
+
+define i32 @t2(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK: ret
+  %0 = getelementptr i32 *%ptr, i32 -1
+  %1 = load i32* %0, align 4
+  ret i32 %1
+}
+
+define i32 @t3(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldur w0, [x0, #-256]
+; CHECK: ret
+  %0 = getelementptr i32 *%ptr, i32 -64
+  %1 = load i32* %0, align 4
+  ret i32 %1
+}
+
+define void @t4(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-4]
+; CHECK: ret
+  %0 = getelementptr i32 *%ptr, i32 -1
+  store i32 0, i32* %0, align 4
+  ret void
+}
+
+define void @t5(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-256]
+; CHECK: ret
+  %0 = getelementptr i32 *%ptr, i32 -64
+  store i32 0, i32* %0, align 4
+  ret void
+}
+
+define void @t6() nounwind {
+; CHECK: t6
+; CHECK: brk #1
+  tail call void @llvm.trap()
+  ret void
+}
+
+declare void @llvm.trap() nounwind

Added: llvm/trunk/test/CodeGen/ARM64/fastcc-tailcall.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fastcc-tailcall.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fastcc-tailcall.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fastcc-tailcall.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @caller(i32* nocapture %p, i32 %a, i32 %b) nounwind optsize ssp {
+; CHECK-NOT: stp
+; CHECK: b       {{_callee|callee}}
+; CHECK-NOT: ldp
+; CHECK: ret
+  %1 = icmp eq i32 %b, 0
+  br i1 %1, label %3, label %2
+
+  tail call fastcc void @callee(i32* %p, i32 %a) optsize
+  br label %3
+
+  ret void
+}
+
+define internal fastcc void @callee(i32* nocapture %p, i32 %a) nounwind optsize noinline ssp {
+  store volatile i32 %a, i32* %p, align 4, !tbaa !0
+  ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}

Added: llvm/trunk/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,18 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=arm64-apple-darwin %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %add = add i8 64, 64 ; 0x40 + 0x40
+  %0 = load i8** %ptr.addr, align 8
+
+  ; CHECK-LABEL: _gep_promotion:
+  ; CHECK: ldrb {{[a-z][0-9]+}}, {{\[[a-z][0-9]+\]}}
+  %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/fcmp-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fcmp-opt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fcmp-opt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fcmp-opt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,173 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+; rdar://10263824
+
+define i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: csinc w0, wzr, wzr, eq
+  %cmp = fcmp une float %a, 0.000000e+00
+  ret i1 %cmp
+}
+
+define i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: csinc w0, wzr, wzr, eq
+  %cmp = fcmp une float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: csinc w0, wzr, wzr, eq
+  %cmp = fcmp une double %a, 0.000000e+00
+  ret i1 %cmp
+}
+
+define i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: csinc w0, wzr, wzr, eq
+  %cmp = fcmp une double %a, %b
+  ret i1 %cmp
+}
+
+; Check each fcmp condition
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+  %cmp = fcmp oeq float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+  %cmp = fcmp ogt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+  %cmp = fcmp oge float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+  %cmp = fcmp olt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+  %cmp = fcmp ole float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vs
+  %cmp = fcmp ord float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vc
+  %cmp = fcmp uno float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ls
+  %cmp = fcmp ugt float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, mi
+  %cmp = fcmp uge float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ge
+  %cmp = fcmp ult float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+  %cmp = fcmp ule float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+  %cmp = fcmp une float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+; Possible opportunity for improvement.  See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_one(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_one
+;	fcmp	s0, s1
+;	orr	w0, wzr, #0x1
+;	csel	w1, w0, wzr, mi
+;	csel	w0, w0, wzr, gt
+  %cmp = fcmp one float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}
+
+; Possible opportunity for improvement.  See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_ueq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ueq
+; CHECK: fcmp s0, s1
+;        orr w0, wzr, #0x1
+; CHECK: csel [[REG1:w[0-9]]], [[REG2:w[0-9]+]], wzr, eq
+; CHECK: csel {{w[0-9]+}}, [[REG2]], [[REG1]], vs
+  %cmp = fcmp ueq float %a, %b
+  %conv = uitofp i1 %cmp to float
+  ret float %conv
+}

Added: llvm/trunk/test/CodeGen/ARM64/fcopysign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fcopysign.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fcopysign.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fcopysign.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; rdar://9332258
+
+define float @test1(float %x, float %y) nounwind {
+entry:
+; CHECK-LABEL: test1:
+; CHECK: movi.4s	v2, #128, lsl #24
+; CHECK: bit.16b	v0, v1, v2
+  %0 = tail call float @copysignf(float %x, float %y) nounwind readnone
+  ret float %0
+}
+
+define double @test2(double %x, double %y) nounwind {
+entry:
+; CHECK-LABEL: test2:
+; CHECK: movi.2d	v2, #0
+; CHECK: fneg.2d	v2, v2
+; CHECK: bit.16b	v0, v1, v2
+  %0 = tail call double @copysign(double %x, double %y) nounwind readnone
+  ret double %0
+}
+
+; rdar://9545768
+define double @test3(double %a, float %b, float %c) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: fcvt d1, s1
+; CHECK: fneg.2d v2, v{{[0-9]+}}
+; CHECK: bit.16b v0, v1, v2
+  %tmp1 = fadd float %b, %c
+  %tmp2 = fpext float %tmp1 to double
+  %tmp = tail call double @copysign( double %a, double %tmp2 ) nounwind readnone
+  ret double %tmp
+}
+
+define float @test4() nounwind {
+entry:
+; CHECK-LABEL: test4:
+; CHECK: fcvt s0, d0
+; CHECK: movi.4s v[[CONST:[0-9]+]], #128, lsl #24
+; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
+  %0 = tail call double (...)* @bar() nounwind
+  %1 = fptrunc double %0 to float
+  %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
+  %3 = fadd float %1, %2
+  ret float %3
+}
+
+declare double @bar(...)
+declare double @copysign(double, double) nounwind readnone
+declare float @copysignf(float, float) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; DAGCombine to transform a conversion of an extract_vector_elt to an
+; extract_vector_elt of a conversion, which saves a round trip of copies
+; of the value to a GPR and back to and FPR.
+; rdar://11855286
+define double @foo0(<2 x i64> %a) nounwind {
+; CHECK:  scvtf.2d  [[REG:v[0-9]+]], v0, #9
+; CHECK-NEXT:  ins.d v0[0], [[REG]][1]
+  %vecext = extractelement <2 x i64> %a, i32 1
+  %fcvt_n = tail call double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+  ret double %fcvt_n
+}
+
+declare double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/fmadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fmadd.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fmadd.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fmadd.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+define float @fma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fma32:
+; CHECK: fmadd
+  %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %0
+}
+
+define float @fnma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnma32:
+; CHECK: fnmadd
+  %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+  %mul = fmul float %0, -1.000000e+00
+  ret float %mul
+}
+
+define float @fms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fms32:
+; CHECK: fmsub
+  %mul = fmul float %b, -1.000000e+00
+  %0 = tail call float @llvm.fma.f32(float %a, float %mul, float %c)
+  ret float %0
+}
+
+define float @fnms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnms32:
+; CHECK: fnmsub
+  %mul = fmul float %c, -1.000000e+00
+  %0 = tail call float @llvm.fma.f32(float %a, float %b, float %mul)
+  ret float %0
+}
+
+define double @fma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fma64:
+; CHECK: fmadd
+entry:
+  %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %0
+}
+
+define double @fnma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnma64:
+; CHECK: fnmadd
+entry:
+  %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+  %mul = fmul double %0, -1.000000e+00
+  ret double %mul
+}
+
+define double @fms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fms64:
+; CHECK: fmsub
+entry:
+  %mul = fmul double %b, -1.000000e+00
+  %0 = tail call double @llvm.fma.f64(double %a, double %mul, double %c)
+  ret double %0
+}
+
+define double @fnms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnms64:
+; CHECK: fnmsub
+entry:
+  %mul = fmul double %c, -1.000000e+00
+  %0 = tail call double @llvm.fma.f64(double %a, double %b, double %mul)
+  ret double %0
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/fmax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fmax.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fmax.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fmax.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
+
+define double @test_direct(float %in) #1 {
+entry:
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %longer = fpext float %in to double
+  %val = select i1 %cmp, double 0.000000e+00, double %longer
+  ret double %val
+
+; CHECK: fmax
+}
+
+define double @test_cross(float %in) #1 {
+entry:
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %longer = fpext float %in to double
+  %val = select i1 %cmp, double %longer, double 0.000000e+00
+  ret double %val
+
+; CHECK: fmin
+}

Added: llvm/trunk/test/CodeGen/ARM64/fmuladd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fmuladd.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fmuladd.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fmuladd.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,88 @@
+; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define float @test_f32(float* %A, float* %B, float* %C) nounwind {
+;CHECK-LABEL: test_f32:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+  %tmp1 = load float* %A
+  %tmp2 = load float* %B
+  %tmp3 = load float* %C
+  %tmp4 = call float @llvm.fmuladd.f32(float %tmp1, float %tmp2, float %tmp3)
+  ret float %tmp4
+}
+
+define <2 x float> @test_v2f32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: test_v2f32:
+;CHECK: fmla.2s
+;CHECK-NOT: fmla.2s
+  %tmp1 = load <2 x float>* %A
+  %tmp2 = load <2 x float>* %B
+  %tmp3 = load <2 x float>* %C
+  %tmp4 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
+  ret <2 x float> %tmp4
+}
+
+define <4 x float> @test_v4f32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: test_v4f32:
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+  %tmp1 = load <4 x float>* %A
+  %tmp2 = load <4 x float>* %B
+  %tmp3 = load <4 x float>* %C
+  %tmp4 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
+  ret <4 x float> %tmp4
+}
+
+define <8 x float> @test_v8f32(<8 x float>* %A, <8 x float>* %B, <8 x float>* %C) nounwind {
+;CHECK-LABEL: test_v8f32:
+;CHECK: fmla.4s
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+  %tmp1 = load <8 x float>* %A
+  %tmp2 = load <8 x float>* %B
+  %tmp3 = load <8 x float>* %C
+  %tmp4 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %tmp1, <8 x float> %tmp2, <8 x float> %tmp3)
+  ret <8 x float> %tmp4
+}
+
+define double @test_f64(double* %A, double* %B, double* %C) nounwind {
+;CHECK-LABEL: test_f64:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+  %tmp1 = load double* %A
+  %tmp2 = load double* %B
+  %tmp3 = load double* %C
+  %tmp4 = call double @llvm.fmuladd.f64(double %tmp1, double %tmp2, double %tmp3)
+  ret double %tmp4
+}
+
+define <2 x double> @test_v2f64(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: test_v2f64:
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+  %tmp1 = load <2 x double>* %A
+  %tmp2 = load <2 x double>* %B
+  %tmp3 = load <2 x double>* %C
+  %tmp4 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
+  ret <2 x double> %tmp4
+}
+
+define <4 x double> @test_v4f64(<4 x double>* %A, <4 x double>* %B, <4 x double>* %C) nounwind {
+;CHECK-LABEL: test_v4f64:
+;CHECK: fmla.2d
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+  %tmp1 = load <4 x double>* %A
+  %tmp2 = load <4 x double>* %B
+  %tmp3 = load <4 x double>* %C
+  %tmp4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %tmp1, <4 x double> %tmp2, <4 x double> %tmp3)
+  ret <4 x double> %tmp4
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/fold-address.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fold-address.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fold-address.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fold-address.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,79 @@
+; RUN: llc < %s -O2 -mtriple=arm64-apple-darwin | FileCheck %s
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+define hidden %struct.CGRect @nofold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: nofold:
+; CHECK: add x[[REG:[0-9]+]], x0, x{{[0-9]+}}
+; CHECK: ldp d0, d1, [x[[REG]]]
+; CHECK: ldp d2, d3, [x[[REG]], #16]
+; CHECK: ret
+  %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+  %0 = bitcast %0* %self to i8*
+  %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+  %add.ptr10.0 = bitcast i8* %add.ptr to double*
+  %tmp11 = load double* %add.ptr10.0, align 8
+  %add.ptr.sum = add i64 %ivar, 8
+  %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+  %1 = bitcast i8* %add.ptr10.1 to double*
+  %tmp12 = load double* %1, align 8
+  %add.ptr.sum17 = add i64 %ivar, 16
+  %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+  %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+  %tmp = load double* %add.ptr4.1.0, align 8
+  %add.ptr4.1.sum = add i64 %ivar, 24
+  %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+  %2 = bitcast i8* %add.ptr4.1.1 to double*
+  %tmp5 = load double* %2, align 8
+  %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+  %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+  %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+  %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+  %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+  %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+  ret %struct.CGRect %insert3
+}
+
+define hidden %struct.CGRect @fold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: fold:
+; CHECK: ldr d0, [x0, x{{[0-9]+}}]
+; CHECK-NOT: add x0, x0, x1
+; CHECK: ret
+  %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+  %0 = bitcast %0* %self to i8*
+  %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+  %add.ptr10.0 = bitcast i8* %add.ptr to double*
+  %tmp11 = load double* %add.ptr10.0, align 8
+  %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %ivar
+  %1 = bitcast i8* %add.ptr10.1 to double*
+  %tmp12 = load double* %1, align 8
+  %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %ivar
+  %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+  %tmp = load double* %add.ptr4.1.0, align 8
+  %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %ivar
+  %2 = bitcast i8* %add.ptr4.1.1 to double*
+  %tmp5 = load double* %2, align 8
+  %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+  %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+  %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+  %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+  %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+  %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+  ret %struct.CGRect %insert3
+}
+
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}

Added: llvm/trunk/test/CodeGen/ARM64/fold-lsl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fold-lsl.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fold-lsl.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fold-lsl.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,79 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+;
+; <rdar://problem/14486451>
+
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+define i16 @load_halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_halfword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldrh w0, [x0, [[REG]], lsl #1]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+  %result = load i16* %arrayidx86, align 2
+  ret i16 %result
+}
+
+define i32 @load_word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_word:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldr w0, [x0, [[REG]], lsl #2]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+  %result = load i32* %arrayidx86, align 4
+  ret i32 %result
+}
+
+define i64 @load_doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_doubleword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldr x0, [x0, [[REG]], lsl #3]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+  %result = load i64* %arrayidx86, align 8
+  ret i64 %result
+}
+
+define void @store_halfword(%struct.a* %ctx, i32 %xor72, i16 %val) nounwind {
+; CHECK-LABEL: store_halfword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: strh w2, [x0, [[REG]], lsl #1]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+  store i16 %val, i16* %arrayidx86, align 8
+  ret void
+}
+
+define void @store_word(%struct.b* %ctx, i32 %xor72, i32 %val) nounwind {
+; CHECK-LABEL: store_word:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: str w2, [x0, [[REG]], lsl #2]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+  store i32 %val, i32* %arrayidx86, align 8
+  ret void
+}
+
+define void @store_doubleword(%struct.c* %ctx, i32 %xor72, i64 %val) nounwind {
+; CHECK-LABEL: store_doubleword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: str x2, [x0, [[REG]], lsl #3]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+  store i64 %val, i64* %arrayidx86, align 8
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/fp-imm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fp-imm.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fp-imm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fp-imm.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; CHECK: literal8
+; CHECK: .quad  4614256656552045848
+define double @foo() {
+; CHECK: _foo:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI0_0 at PAGE
+; CHECK: ldr  d0, [x[[REG]], lCPI0_0 at PAGEOFF]
+; CHECK-NEXT: ret
+  ret double 0x400921FB54442D18
+}
+
+; CHECK: literal4
+; CHECK: .long 1078530011
+define float @bar() {
+; CHECK: _bar:
+; CHECK:  adrp  x[[REG:[0-9]+]], lCPI1_0 at PAGE
+; CHECK:  ldr s0, [x[[REG]], lCPI1_0 at PAGEOFF]
+; CHECK-NEXT:  ret
+  ret float 0x400921FB60000000
+}

Added: llvm/trunk/test/CodeGen/ARM64/fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @t1(i1 %a, float %b, float %c) nounwind {
+; CHECK: t1
+; CHECK: fcsel	s0, s0, s1, ne
+  %sel = select i1 %a, float %b, float %c
+  ret float %sel
+}

Added: llvm/trunk/test/CodeGen/ARM64/fp128-folding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fp128-folding.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fp128-folding.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fp128-folding.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+declare void @bar(i8*, i8*, i32*)
+
+; SelectionDAG used to try to fold some fp128 operations using the ppc128 type,
+; which is not supported.
+
+define fp128 @test_folding() {
+; CHECK-LABEL: test_folding:
+  %l = alloca i32
+  store i32 42, i32* %l
+  %val = load i32* %l
+  %fpval = sitofp i32 %val to fp128
+  ; If the value is loaded from a constant pool into an fp128, it's been folded
+  ; successfully.
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}},
+  ret fp128 %fpval
+}

Added: llvm/trunk/test/CodeGen/ARM64/fp128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/fp128.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/fp128.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/fp128.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,274 @@
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+ at lhs = global fp128 zeroinitializer, align 16
+ at rhs = global fp128 zeroinitializer, align 16
+
+define fp128 @test_add() {
+; CHECK-LABEL: test_add:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  %val = fadd fp128 %lhs, %rhs
+; CHECK: bl __addtf3
+  ret fp128 %val
+}
+
+define fp128 @test_sub() {
+; CHECK-LABEL: test_sub:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  %val = fsub fp128 %lhs, %rhs
+; CHECK: bl __subtf3
+  ret fp128 %val
+}
+
+define fp128 @test_mul() {
+; CHECK-LABEL: test_mul:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  %val = fmul fp128 %lhs, %rhs
+; CHECK: bl __multf3
+  ret fp128 %val
+}
+
+define fp128 @test_div() {
+; CHECK-LABEL: test_div:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  %val = fdiv fp128 %lhs, %rhs
+; CHECK: bl __divtf3
+  ret fp128 %val
+}
+
+ at var32 = global i32 0
+ at var64 = global i64 0
+
+define void @test_fptosi() {
+; CHECK-LABEL: test_fptosi:
+  %val = load fp128* @lhs, align 16
+
+  %val32 = fptosi fp128 %val to i32
+  store i32 %val32, i32* @var32
+; CHECK: bl __fixtfsi
+
+  %val64 = fptosi fp128 %val to i64
+  store i64 %val64, i64* @var64
+; CHECK: bl __fixtfdi
+
+  ret void
+}
+
+define void @test_fptoui() {
+; CHECK-LABEL: test_fptoui:
+  %val = load fp128* @lhs, align 16
+
+  %val32 = fptoui fp128 %val to i32
+  store i32 %val32, i32* @var32
+; CHECK: bl __fixunstfsi
+
+  %val64 = fptoui fp128 %val to i64
+  store i64 %val64, i64* @var64
+; CHECK: bl __fixunstfdi
+
+  ret void
+}
+
+define void @test_sitofp() {
+; CHECK-LABEL: test_sitofp:
+
+  %src32 = load i32* @var32
+  %val32 = sitofp i32 %src32 to fp128
+  store volatile fp128 %val32, fp128* @lhs
+; CHECK: bl __floatsitf
+
+  %src64 = load i64* @var64
+  %val64 = sitofp i64 %src64 to fp128
+  store volatile fp128 %val64, fp128* @lhs
+; CHECK: bl __floatditf
+
+  ret void
+}
+
+define void @test_uitofp() {
+; CHECK-LABEL: test_uitofp:
+
+  %src32 = load i32* @var32
+  %val32 = uitofp i32 %src32 to fp128
+  store volatile fp128 %val32, fp128* @lhs
+; CHECK: bl __floatunsitf
+
+  %src64 = load i64* @var64
+  %val64 = uitofp i64 %src64 to fp128
+  store volatile fp128 %val64, fp128* @lhs
+; CHECK: bl __floatunditf
+
+  ret void
+}
+
+define i1 @test_setcc1() {
+; CHECK-LABEL: test_setcc1:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+; Technically, everything after the call to __letf2 is redundant, but we'll let
+; LLVM have its fun for now.
+  %val = fcmp ole fp128 %lhs, %rhs
+; CHECK: bl __letf2
+; CHECK: cmp w0, #0
+; CHECK: csinc w0, wzr, wzr, gt
+
+  ret i1 %val
+; CHECK: ret
+}
+
+define i1 @test_setcc2() {
+; CHECK-LABEL: test_setcc2:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  %val = fcmp ugt fp128 %lhs, %rhs
+; CHECK: bl      __gttf2
+; CHECK: cmp     w0, #0
+; CHECK: csinc   [[GT:w[0-9]+]], wzr, wzr, le
+
+; CHECK: bl      __unordtf2
+; CHECK: cmp     w0, #0
+; CHECK: csinc   [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+; CHECK: orr     w0, [[UNORDERED]], [[GT]]
+
+  ret i1 %val
+; CHECK: ret
+}
+
+define i32 @test_br_cc() {
+; CHECK-LABEL: test_br_cc:
+
+  %lhs = load fp128* @lhs, align 16
+  %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+  ; olt == !uge, which LLVM unfortunately "optimizes" this to.
+  %cond = fcmp olt fp128 %lhs, %rhs
+; CHECK: bl      __getf2
+; CHECK: cmp     w0, #0
+; CHECK: csinc   [[OGE:w[0-9]+]], wzr, wzr, lt
+
+; CHECK: bl      __unordtf2
+; CHECK: cmp     w0, #0
+; CHECK: csinc   [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
+; CHECK: orr     [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
+; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
+  br i1 %cond, label %iftrue, label %iffalse
+
+iftrue:
+  ret i32 42
+; CHECK-NEXT: BB#
+; CHECK-NEXT: movz w0, #42
+; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
+
+iffalse:
+  ret i32 29
+; CHECK: [[RET29]]:
+; CHECK-NEXT: movz w0, #29
+; CHECK-NEXT: [[REALRET]]:
+; CHECK: ret
+}
+
+define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) {
+; CHECK-LABEL: test_select:
+
+  %val = select i1 %cond, fp128 %lhs, fp128 %rhs
+  store fp128 %val, fp128* @lhs, align 16
+; CHECK: and [[BIT:w[0-9]+]], w0, #0x1
+; CHECK: cmp [[BIT]], #0
+; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: BB#
+; CHECK-NEXT: orr v[[VAL:[0-9]+]].16b, v0.16b, v0.16b
+; CHECK-NEXT: [[IFFALSE]]:
+; CHECK: str q[[VAL]], [{{x[0-9]+}}, :lo12:lhs]
+  ret void
+; CHECK: ret
+}
+
+ at varfloat = global float 0.0, align 4
+ at vardouble = global double 0.0, align 8
+
+define void @test_round() {
+; CHECK-LABEL: test_round:
+
+  %val = load fp128* @lhs, align 16
+
+  %float = fptrunc fp128 %val to float
+  store float %float, float* @varfloat, align 4
+; CHECK: bl __trunctfsf2
+; CHECK: str s0, [{{x[0-9]+}}, :lo12:varfloat]
+
+  %double = fptrunc fp128 %val to double
+  store double %double, double* @vardouble, align 8
+; CHECK: bl __trunctfdf2
+; CHECK: str d0, [{{x[0-9]+}}, :lo12:vardouble]
+
+  ret void
+}
+
+define void @test_extend() {
+; CHECK-LABEL: test_extend:
+
+  %val = load fp128* @lhs, align 16
+
+  %float = load float* @varfloat
+  %fromfloat = fpext float %float to fp128
+  store volatile fp128 %fromfloat, fp128* @lhs, align 16
+; CHECK: bl __extendsftf2
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
+
+  %double = load double* @vardouble
+  %fromdouble = fpext double %double to fp128
+  store volatile fp128 %fromdouble, fp128* @lhs, align 16
+; CHECK: bl __extenddftf2
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
+
+  ret void
+; CHECK: ret
+}
+
+define fp128 @test_neg(fp128 %in) {
+; CHECK: [[MINUS0:.LCPI[0-9]+_0]]:
+; Make sure the weird hex constant below *is* -0.0
+; CHECK-NEXT: fp128 -0
+
+; CHECK-LABEL: test_neg:
+
+  ; Could in principle be optimized to fneg which we can't select, this makes
+  ; sure that doesn't happen.
+  %ret = fsub fp128 0xL00000000000000008000000000000000, %in
+; CHECK: orr v1.16b, v0.16b, v0.16b
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:[[MINUS0]]]
+; CHECK: bl __subtf3
+
+  ret fp128 %ret
+; CHECK: ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/frame-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/frame-index.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/frame-index.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/frame-index.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://11935841
+
+define void @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: add x{{[0-9]+}}, sp
+; CHECK: stp x28, x27, [sp, #-16]!
+  %v = alloca [288 x i32], align 4
+  unreachable
+}

Added: llvm/trunk/test/CodeGen/ARM64/frameaddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/frameaddr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/frameaddr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/frameaddr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @t() nounwind {
+entry:
+; CHECK-LABEL: t:
+; CHECK: stp fp, lr, [sp, #-16]!
+; CHECK: mov fp, sp
+; CHECK: mov x0, fp
+; CHECK: ldp fp, lr, [sp], #16
+; CHECK: ret
+	%0 = call i8* @llvm.frameaddress(i32 0)
+        ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/global-address.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/global-address.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/global-address.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/global-address.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://9618644
+
+ at G = external global i32
+
+define i32 @test(i32 %off) nounwind {
+; CHECK-LABEL: test:
+; CHECK: adrp x[[REG:[0-9]+]], _G at GOTPAGE
+; CHECK: ldr  x[[REG2:[0-9]+]], [x[[REG]], _G at GOTPAGEOFF]
+; CHECK: add w0, w[[REG2]], w0
+  %tmp = ptrtoint i32* @G to i32
+  %tmp1 = add i32 %tmp, %off
+  ret i32 %tmp1
+}

Added: llvm/trunk/test/CodeGen/ARM64/hello.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/hello.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/hello.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/hello.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+
+; CHECK-LABEL: main:
+; CHECK:	stp	fp, lr, [sp, #-16]!
+; CHECK-NEXT:	mov	fp, sp
+; CHECK-NEXT:	sub	sp, sp, #16
+; CHECK-NEXT:	stur	wzr, [fp, #-4]
+; CHECK:	adrp	x0, L_.str at PAGE
+; CHECK:	add	x0, x0, L_.str at PAGEOFF
+; CHECK-NEXT:	bl	_puts
+; CHECK-NEXT:	mov	sp, fp
+; CHECK-NEXT:	ldp	fp, lr, [sp], #16
+; CHECK-NEXT:	ret
+
+; CHECK-LINUX-LABEL: main:
+; CHECK-LINUX:	stp	fp, lr, [sp, #-16]!
+; CHECK-LINUX-NEXT:	mov	fp, sp
+; CHECK-LINUX-NEXT:	sub	sp, sp, #16
+; CHECK-LINUX-NEXT:	stur	wzr, [fp, #-4]
+; CHECK-LINUX:	adrp	x0, .L.str
+; CHECK-LINUX:	add	x0, x0, :lo12:.L.str
+; CHECK-LINUX-NEXT:	bl	puts
+; CHECK-LINUX-NEXT:	mov	sp, fp
+; CHECK-LINUX-NEXT:	ldp	fp, lr, [sp], #16
+; CHECK-LINUX-NEXT:	ret
+
+ at .str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
+
+define i32 @main() nounwind ssp {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0))
+  ret i32 %call
+}
+
+declare i32 @puts(i8*)

Added: llvm/trunk/test/CodeGen/ARM64/i16-subreg-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/i16-subreg-extract.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/i16-subreg-extract.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/i16-subreg-extract.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @foo(<4 x i16>* %__a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: umov.h w{{[0-9]+}}, v{{[0-9]+}}[0]
+  %tmp18 = load <4 x i16>* %__a, align 8
+  %vget_lane = extractelement <4 x i16> %tmp18, i32 0
+  %conv = zext i16 %vget_lane to i32
+  %mul = mul nsw i32 3, %conv
+  ret i32 %mul
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/icmp-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/icmp-opt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/icmp-opt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/icmp-opt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; Optimize (x > -1) to (x >= 0) etc.
+; Optimize (cmp (add / sub), 0): eliminate the subs used to update flag
+;   for comparison only
+; rdar://10233472
+
+define i32 @t1(i64 %a) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: movn
+; CHECK: cmp  x0, #0
+; CHECK: csinc w0, wzr, wzr, lt
+  %cmp = icmp sgt i64 %a, -1
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}

Added: llvm/trunk/test/CodeGen/ARM64/illegal-float-ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/illegal-float-ops.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/illegal-float-ops.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/illegal-float-ops.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,247 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+ at varfloat = global float 0.0
+ at vardouble = global double 0.0
+ at varfp128 = global fp128 zeroinitializer
+
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare fp128 @llvm.cos.f128(fp128)
+
+define void @test_cos(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_cos:
+
+   %cosfloat = call float @llvm.cos.f32(float %float)
+   store float %cosfloat, float* @varfloat
+; CHECK: bl cosf
+
+   %cosdouble = call double @llvm.cos.f64(double %double)
+   store double %cosdouble, double* @vardouble
+; CHECK: bl cos
+
+   %cosfp128 = call fp128 @llvm.cos.f128(fp128 %fp128)
+   store fp128 %cosfp128, fp128* @varfp128
+; CHECK: bl cosl
+
+  ret void
+}
+
+declare float @llvm.exp.f32(float)
+declare double @llvm.exp.f64(double)
+declare fp128 @llvm.exp.f128(fp128)
+
+define void @test_exp(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp:
+
+   %expfloat = call float @llvm.exp.f32(float %float)
+   store float %expfloat, float* @varfloat
+; CHECK: bl expf
+
+   %expdouble = call double @llvm.exp.f64(double %double)
+   store double %expdouble, double* @vardouble
+; CHECK: bl exp
+
+   %expfp128 = call fp128 @llvm.exp.f128(fp128 %fp128)
+   store fp128 %expfp128, fp128* @varfp128
+; CHECK: bl expl
+
+  ret void
+}
+
+declare float @llvm.exp2.f32(float)
+declare double @llvm.exp2.f64(double)
+declare fp128 @llvm.exp2.f128(fp128)
+
+define void @test_exp2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp2:
+
+   %exp2float = call float @llvm.exp2.f32(float %float)
+   store float %exp2float, float* @varfloat
+; CHECK: bl exp2f
+
+   %exp2double = call double @llvm.exp2.f64(double %double)
+   store double %exp2double, double* @vardouble
+; CHECK: bl exp2
+
+   %exp2fp128 = call fp128 @llvm.exp2.f128(fp128 %fp128)
+   store fp128 %exp2fp128, fp128* @varfp128
+; CHECK: bl exp2l
+  ret void
+
+}
+
+declare float @llvm.log.f32(float)
+declare double @llvm.log.f64(double)
+declare fp128 @llvm.log.f128(fp128)
+
+define void @test_log(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log:
+
+   %logfloat = call float @llvm.log.f32(float %float)
+   store float %logfloat, float* @varfloat
+; CHECK: bl logf
+
+   %logdouble = call double @llvm.log.f64(double %double)
+   store double %logdouble, double* @vardouble
+; CHECK: bl log
+
+   %logfp128 = call fp128 @llvm.log.f128(fp128 %fp128)
+   store fp128 %logfp128, fp128* @varfp128
+; CHECK: bl logl
+
+  ret void
+}
+
+declare float @llvm.log2.f32(float)
+declare double @llvm.log2.f64(double)
+declare fp128 @llvm.log2.f128(fp128)
+
+define void @test_log2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log2:
+
+   %log2float = call float @llvm.log2.f32(float %float)
+   store float %log2float, float* @varfloat
+; CHECK: bl log2f
+
+   %log2double = call double @llvm.log2.f64(double %double)
+   store double %log2double, double* @vardouble
+; CHECK: bl log2
+
+   %log2fp128 = call fp128 @llvm.log2.f128(fp128 %fp128)
+   store fp128 %log2fp128, fp128* @varfp128
+; CHECK: bl log2l
+  ret void
+
+}
+
+declare float @llvm.log10.f32(float)
+declare double @llvm.log10.f64(double)
+declare fp128 @llvm.log10.f128(fp128)
+
+define void @test_log10(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log10:
+
+   %log10float = call float @llvm.log10.f32(float %float)
+   store float %log10float, float* @varfloat
+; CHECK: bl log10f
+
+   %log10double = call double @llvm.log10.f64(double %double)
+   store double %log10double, double* @vardouble
+; CHECK: bl log10
+
+   %log10fp128 = call fp128 @llvm.log10.f128(fp128 %fp128)
+   store fp128 %log10fp128, fp128* @varfp128
+; CHECK: bl log10l
+
+  ret void
+}
+
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare fp128 @llvm.sin.f128(fp128)
+
+define void @test_sin(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_sin:
+
+   %sinfloat = call float @llvm.sin.f32(float %float)
+   store float %sinfloat, float* @varfloat
+; CHECK: bl sinf
+
+   %sindouble = call double @llvm.sin.f64(double %double)
+   store double %sindouble, double* @vardouble
+; CHECK: bl sin
+
+   %sinfp128 = call fp128 @llvm.sin.f128(fp128 %fp128)
+   store fp128 %sinfp128, fp128* @varfp128
+; CHECK: bl sinl
+  ret void
+
+}
+
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)
+declare fp128 @llvm.pow.f128(fp128, fp128)
+
+define void @test_pow(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_pow:
+
+   %powfloat = call float @llvm.pow.f32(float %float, float %float)
+   store float %powfloat, float* @varfloat
+; CHECK: bl powf
+
+   %powdouble = call double @llvm.pow.f64(double %double, double %double)
+   store double %powdouble, double* @vardouble
+; CHECK: bl pow
+
+   %powfp128 = call fp128 @llvm.pow.f128(fp128 %fp128, fp128 %fp128)
+   store fp128 %powfp128, fp128* @varfp128
+; CHECK: bl powl
+
+  ret void
+}
+
+declare float @llvm.powi.f32(float, i32)
+declare double @llvm.powi.f64(double, i32)
+declare fp128 @llvm.powi.f128(fp128, i32)
+
+define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128) {
+; CHECK-LABEL: test_powi:
+
+   %powifloat = call float @llvm.powi.f32(float %float, i32 %exponent)
+   store float %powifloat, float* @varfloat
+; CHECK: bl __powisf2
+
+   %powidouble = call double @llvm.powi.f64(double %double, i32 %exponent)
+   store double %powidouble, double* @vardouble
+; CHECK: bl __powidf2
+
+   %powifp128 = call fp128 @llvm.powi.f128(fp128 %fp128, i32 %exponent)
+   store fp128 %powifp128, fp128* @varfp128
+; CHECK: bl __powitf2
+  ret void
+
+}
+
+define void @test_frem(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_frem:
+
+  %fremfloat = frem float %float, %float
+  store float %fremfloat, float* @varfloat
+; CHECK: bl fmodf
+
+  %fremdouble = frem double %double, %double
+  store double %fremdouble, double* @vardouble
+; CHECK: bl fmod
+
+  %fremfp128 = frem fp128 %fp128, %fp128
+  store fp128 %fremfp128, fp128* @varfp128
+; CHECK: bl fmodl
+
+  ret void
+}
+
+declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define void @test_fma(fp128 %fp128) {
+; CHECK-LABEL: test_fma:
+
+  %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+  store fp128 %fmafp128, fp128* @varfp128
+; CHECK: bl fmal
+
+  ret void
+}
+
+declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128)
+
+define void @test_fmuladd(fp128 %fp128) {
+; CHECK-LABEL: test_fmuladd:
+
+  %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+  store fp128 %fmuladdfp128, fp128* @varfp128
+; CHECK-NOT: bl fmal
+; CHECK: bl __multf3
+; CHECK: bl __addtf3
+
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/indexed-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/indexed-memory.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/indexed-memory.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/indexed-memory.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,351 @@
+; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+
+define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store64:
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: ret
+  %tmp = load i64** %out, align 8
+  %incdec.ptr = getelementptr inbounds i64* %tmp, i64 1
+  store i64 %spacing, i64* %tmp, align 4
+  store i64* %incdec.ptr, i64** %out, align 8
+  ret void
+}
+
+define void @store32(i32** nocapture %out, i32 %index, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store32:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+  %tmp = load i32** %out, align 8
+  %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+  store i32 %spacing, i32* %tmp, align 4
+  store i32* %incdec.ptr, i32** %out, align 8
+  ret void
+}
+
+define void @store16(i16** nocapture %out, i16 %index, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store16:
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #2
+; CHECK: ret
+  %tmp = load i16** %out, align 8
+  %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+  store i16 %spacing, i16* %tmp, align 4
+  store i16* %incdec.ptr, i16** %out, align 8
+  ret void
+}
+
+define void @store8(i8** nocapture %out, i8 %index, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store8:
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #1
+; CHECK: ret
+  %tmp = load i8** %out, align 8
+  %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+  store i8 %spacing, i8* %tmp, align 4
+  store i8* %incdec.ptr, i8** %out, align 8
+  ret void
+}
+
+define void @truncst64to32(i32** nocapture %out, i32 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to32:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+  %tmp = load i32** %out, align 8
+  %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+  %trunc = trunc i64 %spacing to i32
+  store i32 %trunc, i32* %tmp, align 4
+  store i32* %incdec.ptr, i32** %out, align 8
+  ret void
+}
+
+define void @truncst64to16(i16** nocapture %out, i16 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to16:
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #2
+; CHECK: ret
+  %tmp = load i16** %out, align 8
+  %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+  %trunc = trunc i64 %spacing to i16
+  store i16 %trunc, i16* %tmp, align 4
+  store i16* %incdec.ptr, i16** %out, align 8
+  ret void
+}
+
+define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to8:
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #1
+; CHECK: ret
+  %tmp = load i8** %out, align 8
+  %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+  %trunc = trunc i64 %spacing to i8
+  store i8 %trunc, i8* %tmp, align 4
+  store i8* %incdec.ptr, i8** %out, align 8
+  ret void
+}
+
+
+define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef32:
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+  %tmp = load float** %out, align 8
+  %incdec.ptr = getelementptr inbounds float* %tmp, i64 1
+  store float %spacing, float* %tmp, align 4
+  store float* %incdec.ptr, float** %out, align 8
+  ret void
+}
+
+define void @storef64(double** nocapture %out, double %index, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef64:
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: ret
+  %tmp = load double** %out, align 8
+  %incdec.ptr = getelementptr inbounds double* %tmp, i64 1
+  store double %spacing, double* %tmp, align 4
+  store double* %incdec.ptr, double** %out, align 8
+  ret void
+}
+
+define double * @pref64(double** nocapture %out, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref64:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: str     d0, [x0, #32]!
+; CHECK-NEXT: ret
+  %tmp = load double** %out, align 8
+  %ptr = getelementptr inbounds double* %tmp, i64 4
+  store double %spacing, double* %ptr, align 4
+  ret double *%ptr
+}
+
+define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref32:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: str     s0, [x0, #12]!
+; CHECK-NEXT: ret
+  %tmp = load float** %out, align 8
+  %ptr = getelementptr inbounds float* %tmp, i64 3
+  store float %spacing, float* %ptr, align 4
+  ret float *%ptr
+}
+
+define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre64:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: str     x1, [x0, #16]!
+; CHECK-NEXT: ret
+  %tmp = load i64** %out, align 8
+  %ptr = getelementptr inbounds i64* %tmp, i64 2
+  store i64 %spacing, i64* %ptr, align 4
+  ret i64 *%ptr
+}
+
+define i32 * @pre32(i32** nocapture %out, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre32:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: str     w1, [x0, #8]!
+; CHECK-NEXT: ret
+  %tmp = load i32** %out, align 8
+  %ptr = getelementptr inbounds i32* %tmp, i64 2
+  store i32 %spacing, i32* %ptr, align 4
+  ret i32 *%ptr
+}
+
+define i16 * @pre16(i16** nocapture %out, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre16:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: strh    w1, [x0, #4]!
+; CHECK-NEXT: ret
+  %tmp = load i16** %out, align 8
+  %ptr = getelementptr inbounds i16* %tmp, i64 2
+  store i16 %spacing, i16* %ptr, align 4
+  ret i16 *%ptr
+}
+
+define i8 * @pre8(i8** nocapture %out, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre8:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: strb    w1, [x0, #2]!
+; CHECK-NEXT: ret
+  %tmp = load i8** %out, align 8
+  %ptr = getelementptr inbounds i8* %tmp, i64 2
+  store i8 %spacing, i8* %ptr, align 4
+  ret i8 *%ptr
+}
+
+define i32 * @pretrunc64to32(i32** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to32:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: str     w1, [x0, #8]!
+; CHECK-NEXT: ret
+  %tmp = load i32** %out, align 8
+  %ptr = getelementptr inbounds i32* %tmp, i64 2
+  %trunc = trunc i64 %spacing to i32
+  store i32 %trunc, i32* %ptr, align 4
+  ret i32 *%ptr
+}
+
+define i16 * @pretrunc64to16(i16** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to16:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: strh    w1, [x0, #4]!
+; CHECK-NEXT: ret
+  %tmp = load i16** %out, align 8
+  %ptr = getelementptr inbounds i16* %tmp, i64 2
+  %trunc = trunc i64 %spacing to i16
+  store i16 %trunc, i16* %ptr, align 4
+  ret i16 *%ptr
+}
+
+define i8 * @pretrunc64to8(i8** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to8:
+; CHECK: ldr     x0, [x0]
+; CHECK-NEXT: strb    w1, [x0, #2]!
+; CHECK-NEXT: ret
+  %tmp = load i8** %out, align 8
+  %ptr = getelementptr inbounds i8* %tmp, i64 2
+  %trunc = trunc i64 %spacing to i8
+  store i8 %trunc, i8* %ptr, align 4
+  ret i8 *%ptr
+}
+
+;-----
+; Pre-indexed loads
+;-----
+define double* @preidxf64(double* %src, double* %out) {
+; CHECK-LABEL: preidxf64:
+; CHECK: ldr     d0, [x0, #8]!
+; CHECK: str     d0, [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds double* %src, i64 1
+  %tmp = load double* %ptr, align 4
+  store double %tmp, double* %out, align 4
+  ret double* %ptr
+}
+
+define float* @preidxf32(float* %src, float* %out) {
+; CHECK-LABEL: preidxf32:
+; CHECK: ldr     s0, [x0, #4]!
+; CHECK: str     s0, [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds float* %src, i64 1
+  %tmp = load float* %ptr, align 4
+  store float %tmp, float* %out, align 4
+  ret float* %ptr
+}
+
+define i64* @preidx64(i64* %src, i64* %out) {
+; CHECK-LABEL: preidx64:
+; CHECK: ldr     x[[REG:[0-9]+]], [x0, #8]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i64* %src, i64 1
+  %tmp = load i64* %ptr, align 4
+  store i64 %tmp, i64* %out, align 4
+  ret i64* %ptr
+}
+
+define i32* @preidx32(i32* %src, i32* %out) {
+; CHECK: ldr     w[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str     w[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i32* %src, i64 1
+  %tmp = load i32* %ptr, align 4
+  store i32 %tmp, i32* %out, align 4
+  ret i32* %ptr
+}
+
+define i16* @preidx16zext32(i16* %src, i32* %out) {
+; CHECK: ldrh    w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str     w[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i16* %src, i64 1
+  %tmp = load i16* %ptr, align 4
+  %ext = zext i16 %tmp to i32
+  store i32 %ext, i32* %out, align 4
+  ret i16* %ptr
+}
+
+define i16* @preidx16zext64(i16* %src, i64* %out) {
+; CHECK: ldrh    w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i16* %src, i64 1
+  %tmp = load i16* %ptr, align 4
+  %ext = zext i16 %tmp to i64
+  store i64 %ext, i64* %out, align 4
+  ret i16* %ptr
+}
+
+define i8* @preidx8zext32(i8* %src, i32* %out) {
+; CHECK: ldrb    w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str     w[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i8* %src, i64 1
+  %tmp = load i8* %ptr, align 4
+  %ext = zext i8 %tmp to i32
+  store i32 %ext, i32* %out, align 4
+  ret i8* %ptr
+}
+
+define i8* @preidx8zext64(i8* %src, i64* %out) {
+; CHECK: ldrb    w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i8* %src, i64 1
+  %tmp = load i8* %ptr, align 4
+  %ext = zext i8 %tmp to i64
+  store i64 %ext, i64* %out, align 4
+  ret i8* %ptr
+}
+
+define i32* @preidx32sext64(i32* %src, i64* %out) {
+; CHECK: ldrsw   x[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i32* %src, i64 1
+  %tmp = load i32* %ptr, align 4
+  %ext = sext i32 %tmp to i64
+  store i64 %ext, i64* %out, align 8
+  ret i32* %ptr
+}
+
+define i16* @preidx16sext32(i16* %src, i32* %out) {
+; CHECK: ldrsh   w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str     w[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i16* %src, i64 1
+  %tmp = load i16* %ptr, align 4
+  %ext = sext i16 %tmp to i32
+  store i32 %ext, i32* %out, align 4
+  ret i16* %ptr
+}
+
+define i16* @preidx16sext64(i16* %src, i64* %out) {
+; CHECK: ldrsh   x[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i16* %src, i64 1
+  %tmp = load i16* %ptr, align 4
+  %ext = sext i16 %tmp to i64
+  store i64 %ext, i64* %out, align 4
+  ret i16* %ptr
+}
+
+define i8* @preidx8sext32(i8* %src, i32* %out) {
+; CHECK: ldrsb   w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str     w[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i8* %src, i64 1
+  %tmp = load i8* %ptr, align 4
+  %ext = sext i8 %tmp to i32
+  store i32 %ext, i32* %out, align 4
+  ret i8* %ptr
+}
+
+define i8* @preidx8sext64(i8* %src, i64* %out) {
+; CHECK: ldrsb   x[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str     x[[REG]], [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds i8* %src, i64 1
+  %tmp = load i8* %ptr, align 4
+  %ext = sext i8 %tmp to i64
+  store i64 %ext, i64* %out, align 4
+  ret i8* %ptr
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-I.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-I.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-I.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-I.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'I'
+
+define i32 @constraint_I(i32 %i, i32 %j) nounwind ssp {
+entry:
+  %0 = tail call i32 asm sideeffect "add $0, $1, $2", "=r,r,I"(i32 %i, i32 4097) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-J.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-J.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-J.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-J.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'J'
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind ssp {
+entry:
+  %0 = tail call i32 asm sideeffect "sub $0, $1, $2", "=r,r,J"(i32 %i, i32 2) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-K.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-K.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-K.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-K.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'K'
+
+define i32 @constraint_K(i32 %i, i32 %j) nounwind {
+entry:
+  %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,K"(i32 %i, i32 -1) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-L.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-L.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-L.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-L.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'L'
+
+define i32 @constraint_L(i32 %i, i32 %j) nounwind {
+entry:
+  %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,L"(i32 %i, i64 -1) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-M.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-M.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-M.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-M.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'M'
+
+define i32 @constraint_M(i32 %i, i32 %j) nounwind {
+entry:
+  %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,M"(i32 305418240) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-error-N.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-error-N.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-error-N.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-error-N.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s  2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS:	error: invalid operand for inline asm constraint 'N'
+
+define i32 @constraint_N(i32 %i, i32 %j) nounwind {
+entry:
+  %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,N"(i64 1311761352401879040) nounwind
+  ret i32 %0
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: not llc < %s -march=arm64 2>&1 | FileCheck %s
+
+
+; The 'z' constraint allocates either xzr or wzr, but obviously an input of 1 is
+; incompatible.
+define void @test_bad_zero_reg() {
+  tail call void asm sideeffect "USE($0)", "z"(i32 1) nounwind
+; CHECK: error: invalid operand for inline asm constraint 'z'
+
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/inline-asm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/inline-asm.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/inline-asm.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/inline-asm.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -no-integrated-as | FileCheck %s
+
+; rdar://9167275
+
+define i32 @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mov {{w[0-9]+}}, 7
+  %0 = tail call i32 asm "mov ${0:w}, 7", "=r"() nounwind
+  ret i32 %0
+}
+
+define i64 @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mov {{x[0-9]+}}, 7
+  %0 = tail call i64 asm "mov $0, 7", "=r"() nounwind
+  ret i64 %0
+}
+
+define i64 @t3() nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: mov {{w[0-9]+}}, 7
+  %0 = tail call i64 asm "mov ${0:w}, 7", "=r"() nounwind
+  ret i64 %0
+}
+
+; rdar://9281206
+
+define void @t4(i64 %op) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: mov x0, {{x[0-9]+}}; svc #0
+  %0 = tail call i64 asm sideeffect "mov x0, $1; svc #0;", "=r,r,r,~{x0}"(i64 %op, i64 undef) nounwind
+  ret void
+}
+
+; rdar://9394290
+
+define float @t5(float %x) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %0 = tail call float asm "fadd ${0:s}, ${0:s}, ${0:s}", "=w,0"(float %x) nounwind
+  ret float %0
+}
+
+; rdar://9553599
+
+define zeroext i8 @t6(i8* %src) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldtrb {{w[0-9]+}}, [{{x[0-9]+}}]
+  %0 = tail call i8 asm "ldtrb ${0:w}, [$1]", "=r,r"(i8* %src) nounwind
+  ret i8 %0
+}
+
+define void @t7(i8* %f, i32 %g) nounwind {
+entry:
+  %f.addr = alloca i8*, align 8
+  store i8* %f, i8** %f.addr, align 8
+  ; CHECK-LABEL: t7:
+  ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+  call void asm "str ${1:w}, $0", "=*Q,r"(i8** %f.addr, i32 %g) nounwind
+  ret void
+}
+
+; rdar://10258229
+; ARM64TargetLowering::getRegForInlineAsmConstraint() should recognize 'v'
+; registers.
+define void @t8() nounwind ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: stp {{d[0-9]+}}, {{d[0-9]+}}, [sp, #-16]
+  tail call void asm sideeffect "nop", "~{v8}"() nounwind
+  ret void
+}
+
+define i32 @constraint_I(i32 %i, i32 %j) nounwind {
+entry:
+  ; CHECK-LABEL: constraint_I:
+  %0 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 16773120) nounwind
+  ; CHECK: add   {{w[0-9]+}}, {{w[0-9]+}}, #16773120
+  %1 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 4096) nounwind
+  ; CHECK: add   {{w[0-9]+}}, {{w[0-9]+}}, #4096
+  ret i32 %1
+}
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+entry:
+  ; CHECK-LABEL: constraint_J:
+  %0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+  %1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+  ret i32 %1
+}
+
+define i32 @constraint_KL(i32 %i, i32 %j) nounwind {
+entry:
+  ; CHECK-LABEL: constraint_KL:
+  %0 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,K"(i32 %i, i32 255) nounwind
+  ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #255
+  %1 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,L"(i32 %i, i64 16711680) nounwind
+  ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #16711680
+  ret i32 %1
+}
+
+define i32 @constraint_MN(i32 %i, i32 %j) nounwind {
+entry:
+  ; CHECK-LABEL: constraint_MN:
+  %0 = tail call i32 asm sideeffect "movk ${0:w}, $1", "=r,M"(i32 65535) nounwind
+  ; CHECK: movk  {{w[0-9]+}}, #65535
+  %1 = tail call i32 asm sideeffect "movz ${0:w}, $1", "=r,N"(i64 0) nounwind
+  ; CHECK: movz  {{w[0-9]+}}, #0
+  ret i32 %1
+}
+
+define void @t9() nounwind {
+entry:
+  ; CHECK-LABEL: t9:
+  %data = alloca <2 x double>, align 16
+  %0 = load <2 x double>* %data, align 16
+  call void asm sideeffect "mov.2d v4, $0\0A", "w,~{v4}"(<2 x double> %0) nounwind
+  ; CHECK: mov.2d v4, {{v[0-9]+}}
+  ret void
+}
+
+define void @t10() nounwind {
+entry:
+  ; CHECK-LABEL: t10:
+  %data = alloca <2 x float>, align 8
+  %a = alloca [2 x float], align 4
+  %arraydecay = getelementptr inbounds [2 x float]* %a, i32 0, i32 0
+  %0 = load <2 x float>* %data, align 8
+  call void asm sideeffect "ldr ${1:q}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+  ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+  call void asm sideeffect "ldr ${1:d}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+  ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+  call void asm sideeffect "ldr ${1:s}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+  ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}]
+  call void asm sideeffect "ldr ${1:h}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+  ; CHECK: ldr {{h[0-9]+}}, [{{x[0-9]+}}]
+  call void asm sideeffect "ldr ${1:b}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+  ; CHECK: ldr {{b[0-9]+}}, [{{x[0-9]+}}]
+  ret void
+}
+
+define void @t11() nounwind {
+entry:
+  ; CHECK-LABEL: t11:
+  %a = alloca i32, align 4
+  %0 = load i32* %a, align 4
+  call void asm sideeffect "mov ${1:x}, ${0:x}\0A", "r,i"(i32 %0, i32 0) nounwind
+  ; CHECK: mov xzr, {{x[0-9]+}}
+  %1 = load i32* %a, align 4
+  call void asm sideeffect "mov ${1:w}, ${0:w}\0A", "r,i"(i32 %1, i32 0) nounwind
+  ; CHECK: mov wzr, {{w[0-9]+}}
+  ret void
+}
+
+define void @t12() nounwind {
+entry:
+  ; CHECK-LABEL: t12:
+  %data = alloca <4 x float>, align 16
+  %0 = load <4 x float>* %data, align 16
+  call void asm sideeffect "mov.2d v4, $0\0A", "x,~{v4}"(<4 x float> %0) nounwind
+  ; CHECK: mov.2d v4, {{v([0-9]|1[0-5])}}
+  ret void
+}
+
+define void @t13() nounwind {
+entry:
+  ; CHECK-LABEL: t13:
+  tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 1311673391471656960) nounwind
+  ; CHECK: mov x4, #1311673391471656960
+  tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 -4662) nounwind
+  ; CHECK: mov x4, #-4662
+  tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 4660) nounwind
+  ; CHECK: mov x4, #4660
+  call void asm sideeffect "mov x4, $0\0A", "N"(i64 -71777214294589696) nounwind
+  ; CHECK: mov x4, #-71777214294589696
+  ret void
+}
+
+define void @t14() nounwind {
+entry:
+  ; CHECK-LABEL: t14:
+  tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 305397760) nounwind
+  ; CHECK: mov w4, #305397760
+  tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 -4662) nounwind
+  ; CHECK: mov w4, #4294962634
+  tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 4660) nounwind
+  ; CHECK: mov w4, #4660
+  call void asm sideeffect "mov w4, $0\0A", "M"(i32 -16711936) nounwind
+  ; CHECK: mov w4, #4278255360
+  ret void
+}
+
+define void @t15() nounwind {
+entry:
+  %0 = tail call double asm sideeffect "fmov $0, d8", "=r"() nounwind
+  ; CHECK: fmov {{x[0-9]+}}, d8
+  ret void
+}
+
+; rdar://problem/14285178
+
+define void @test_zero_reg(i32* %addr) {
+; CHECK-LABEL: test_zero_reg:
+
+  tail call void asm sideeffect "USE($0)", "z"(i32 0) nounwind
+; CHECK: USE(xzr)
+
+  tail call void asm sideeffect "USE(${0:w})", "zr"(i32 0)
+; CHECK: USE(wzr)
+
+  tail call void asm sideeffect "USE(${0:w})", "zr"(i32 1)
+; CHECK: orr [[VAL1:w[0-9]+]], wzr, #0x1
+; CHECK: USE([[VAL1]])
+
+  tail call void asm sideeffect "USE($0), USE($1)", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(xzr)
+
+  tail call void asm sideeffect "USE($0), USE(${1:w})", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(wzr)
+
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/join-reserved.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/join-reserved.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/join-reserved.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/join-reserved.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Make sure that a store to [sp] addresses off sp directly.
+; A move isn't necessary.
+; <rdar://problem/11492712>
+; CHECK-LABEL: g:
+; CHECK: str xzr, [sp]
+; CHECK: bl
+; CHECK: ret
+define void @g() nounwind ssp {
+entry:
+  tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+  ret void
+}
+
+declare void @f(i32, ...)

Added: llvm/trunk/test/CodeGen/ARM64/jumptable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/jumptable.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/jumptable.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/jumptable.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://11417675>
+
+define void @sum(i32* %to) {
+entry:
+  switch i32 undef, label %exit [
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+  ]
+bb1:
+  store i32 undef, i32* %to
+  br label %exit
+bb2:
+  store i32 undef, i32* %to
+  br label %exit
+bb3:
+  store i32 undef, i32* %to
+  br label %exit
+bb4:
+  store i32 undef, i32* %to
+  br label %exit
+exit:
+  ret void
+}
+
+; CHECK-LABEL: sum:
+; CHECK: adrp    {{x[0-9]+}}, LJTI0_0 at PAGE
+; CHECK:  add    {{x[0-9]+}}, {{x[0-9]+}}, LJTI0_0 at PAGEOFF
+
+; CHECK-LINUX-LABEL: sum:
+; CHECK-LINUX: adrp    {{x[0-9]+}}, .LJTI0_0
+; CHECK-LINUX:  add    {{x[0-9]+}}, {{x[0-9]+}}, :lo12:.LJTI0_0

Added: llvm/trunk/test/CodeGen/ARM64/ld1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ld1.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ld1.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ld1.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,1254 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+; NOTE: the FileCheck directives below originally lacked the mandatory
+; colon (e.g. "; CHECK ld2.8b"), so FileCheck silently ignored them and
+; only the function-name CHECKs were actually verified. The colons are
+; restored so the register/addressing checks are enforced.
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
+%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> }
+
+define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
+; CHECK: ld2_8b
+; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
+; and from the argument of the function also defined by ABI (i.e., x0)
+; CHECK: ld2.8b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x8x2_t %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
+; CHECK: ld3_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x8x3_t %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
+; CHECK: ld4_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x8x4_t %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>,  <16 x i8> }
+%struct.__neon_int8x16x3_t = type { <16 x i8>,  <16 x i8>,  <16 x i8> }
+%struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> }
+
+define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
+; CHECK: ld2_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.16b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
+; CHECK: ld3_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
+; CHECK: ld4_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+  ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int16x4x2_t = type { <4 x i16>,  <4 x i16> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>,  <4 x i16>,  <4 x i16> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> }
+
+define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
+; CHECK: ld2_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.4h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x4x2_t %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
+; CHECK: ld3_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x4x3_t %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
+; CHECK: ld4_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x4x4_t %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>,  <8 x i16> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>,  <8 x i16>,  <8 x i16> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> }
+
+define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
+; CHECK: ld2_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.8h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
+; CHECK: ld3_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
+; CHECK: ld4_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int32x2x2_t = type { <2 x i32>,  <2 x i32> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>,  <2 x i32>,  <2 x i32> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> }
+
+define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
+; CHECK: ld2_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.2s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x2x2_t %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
+; CHECK: ld3_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x2x3_t %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
+; CHECK: ld4_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x2x4_t %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int32x4x2_t = type { <4 x i32>,  <4 x i32> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>,  <4 x i32>,  <4 x i32> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> }
+
+define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
+; CHECK: ld2_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.4s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
+; CHECK: ld3_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
+; CHECK: ld4_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+  ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int64x2x2_t = type { <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x3_t = type { <2 x i64>,  <2 x i64>,  <2 x i64> }
+%struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }
+
+define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
+; CHECK: ld2_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.2d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
+; CHECK: ld3_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
+; CHECK: ld4_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_int64x1x2_t = type { <1 x i64>,  <1 x i64> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>,  <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>,  <1 x i64>, <1 x i64>, <1 x i64> }
+
+; The v1i64 (and v1f64 below) forms are expected to lower to the
+; multi-register LD1 form rather than LD2/LD3/LD4.
+
+define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
+; CHECK: ld2_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x1x2_t %tmp2
+}
+
+define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
+; CHECK: ld3_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x1x3_t %tmp2
+}
+
+define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
+; CHECK: ld4_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+  ret %struct.__neon_int64x1x4_t %tmp2
+}
+
+declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_float64x1x2_t = type { <1 x double>,  <1 x double> }
+%struct.__neon_float64x1x3_t = type { <1 x double>,  <1 x double>, <1 x double> }
+%struct.__neon_float64x1x4_t = type { <1 x double>,  <1 x double>, <1 x double>, <1 x double> }
+
+define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
+; CHECK: ld2_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+  ret %struct.__neon_float64x1x2_t %tmp2
+}
+
+define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
+; CHECK: ld3_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+  ret %struct.__neon_float64x1x3_t %tmp2
+}
+
+define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
+; CHECK: ld4_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+  ret %struct.__neon_float64x1x4_t %tmp2
+}
+
+declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
+
+
+; Lane loads: colons restored on the FileCheck directives (they were
+; previously ignored), and the ld3 register lists corrected from
+; { v0, v1, v3 } to the consecutive { v0, v1, v2 } a 3-register LD3
+; must use.
+define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_16b
+; CHECK: ld2.b { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
+  ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_16b
+; CHECK: ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
+  ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_16b
+; CHECK: ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
+  ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_8h
+; CHECK: ld2.h { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
+  ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_8h
+; CHECK: ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
+  ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_8h
+; CHECK: ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
+  ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_4s
+; CHECK: ld2.s { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
+  ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_4s
+; CHECK: ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
+  ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_4s
+; CHECK: ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
+  ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_2d
+; CHECK: ld2.d { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
+  ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_2d
+; CHECK: ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
+  ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_2d
+; CHECK: ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+  %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
+  ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+; LD1R (load-one-and-replicate): the "CHECK-NEXT ret" directives were
+; missing their colon and thus ignored by FileCheck; restored below.
+define <8 x i8> @ld1r_8b(i8* %bar) {
+; CHECK: ld1r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8b { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+  ret <8 x i8> %tmp9
+}
+
+define <16 x i8> @ld1r_16b(i8* %bar) {
+; CHECK: ld1r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.16b { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+  %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+  %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+  %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+  %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+  %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+  %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+  %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+  %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+  %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+  %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+  %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+  %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+  %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+  %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+  %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+  ret <16 x i8> %tmp17
+}
+
+define <4 x i16> @ld1r_4h(i16* %bar) {
+; CHECK: ld1r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4h { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+  ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @ld1r_8h(i16* %bar) {
+; CHECK: ld1r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8h { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+  %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+  %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+  %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+  %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+  %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+  ret <8 x i16> %tmp9
+}
+
+define <2 x i32> @ld1r_2s(i32* %bar) {
+; CHECK: ld1r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+  ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ld1r_4s(i32* %bar) {
+; CHECK: ld1r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+  %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ld1r_2d(i64* %bar) {
+; CHECK: ld1r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+  %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+  ret <2 x i64> %tmp3
+}
+
+define %struct.__neon_int8x8x2_t @ld2r_8b(i8* %A) nounwind {
+; CHECK: ld2r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.8b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x8x2_t  %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3r_8b(i8* %A) nounwind {
+; CHECK: ld3r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x8x3_t  %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4r_8b(i8* %A) nounwind {
+; CHECK: ld4r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x8x4_t  %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind {
+; CHECK: ld2r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.16b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x16x2_t  %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3r_16b(i8* %A) nounwind {
+; CHECK: ld3r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x16x3_t  %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4r_16b(i8* %A) nounwind {
+; CHECK: ld4r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+	ret %struct.__neon_int8x16x4_t  %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind {
+; CHECK: ld2r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.4h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+	ret %struct.__neon_int16x4x2_t  %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3r_4h(i16* %A) nounwind {
+; CHECK: ld3r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+	ret %struct.__neon_int16x4x3_t  %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4r_4h(i16* %A) nounwind {
+; CHECK: ld4r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+	ret %struct.__neon_int16x4x4_t  %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind {
+; CHECK: ld2r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.8h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x2_t  %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3r_8h(i16* %A) nounwind {
+; CHECK: ld3r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x3_t  %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4r_8h(i16* %A) nounwind {
+; CHECK: ld4r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+  ret %struct.__neon_int16x8x4_t  %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind {
+; CHECK: ld2r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.2s { v0, v1 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x2x2_t  %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3r_2s(i32* %A) nounwind {
+; CHECK: ld3r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x2x3_t  %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4r_2s(i32* %A) nounwind {
+; CHECK: ld4r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x2x4_t  %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+
+; FIX: the "CHECK"/"CHECK-NEXT" directives below previously lacked the
+; trailing ':' and were silently ignored by FileCheck; they are now active.
+define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind {
+; CHECK: ld2r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.4s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x4x2_t  %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3r_4s(i32* %A) nounwind {
+; CHECK: ld3r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x4x3_t  %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4r_4s(i32* %A) nounwind {
+; CHECK: ld4r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+	ret %struct.__neon_int32x4x4_t  %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+
+; FIX: the "CHECK"/"CHECK-NEXT" directives below previously lacked the
+; trailing ':' and were silently ignored by FileCheck; they are now active.
+define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind {
+; CHECK: ld2r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.2d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+	ret %struct.__neon_int64x2x2_t  %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3r_2d(i64* %A) nounwind {
+; CHECK: ld3r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+	ret %struct.__neon_int64x2x3_t  %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4r_2d(i64* %A) nounwind {
+; CHECK: ld4r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+	ret %struct.__neon_int64x2x4_t  %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+
+; Lane-insert from memory should select the single-lane ld1 forms.
+; FIX: the "CHECK-NEXT" directives below previously lacked the trailing ':'
+; and were silently ignored by FileCheck; they are now active.
+define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
+; CHECK: ld1_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
+; CHECK: ld1_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
+; CHECK: ld1_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0
+  ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
+; CHECK: ld1_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load i64* %bar
+  %tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0
+  ret <2 x i64> %tmp2
+}
+
+define <1 x i64> @ld1_1d(<1 x i64>* %p) {
+; CHECK: ld1_1d
+; Make sure we are using the operands defined by the ABI
+; A <1 x i64> fits entirely in one D register, so a plain ldr is expected
+; here rather than an ld1.
+; CHECK: ldr [[REG:d[0-9]+]], [x0]
+; CHECK-NEXT: ret
+  %tmp = load <1 x i64>* %p, align 8
+  ret <1 x i64> %tmp
+}
+
+
+; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
+define void @ld1r_2s_from_dup(i8* nocapture %a, i8* nocapture %b, i16* nocapture %diff) nounwind ssp {
+entry:
+; Each load+insert+splat sequence below must be matched to a single
+; broadcast load (ld1r.2s), and the widening subtract to usubl.8h.
+; CHECK: ld1r_2s_from_dup
+; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
+; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
+; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: str d[[RESREGNUM]], [x2]
+; CHECK-NEXT: ret
+  %tmp = bitcast i8* %a to i32*
+  %tmp1 = load i32* %tmp, align 4
+  %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+  %lane = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp3 = bitcast <2 x i32> %lane to <8 x i8>
+  %tmp4 = bitcast i8* %b to i32*
+  %tmp5 = load i32* %tmp4, align 4
+  %tmp6 = insertelement <2 x i32> undef, i32 %tmp5, i32 0
+  %lane1 = shufflevector <2 x i32> %tmp6, <2 x i32> undef, <2 x i32> zeroinitializer
+  %tmp7 = bitcast <2 x i32> %lane1 to <8 x i8>
+  %vmovl.i.i = zext <8 x i8> %tmp3 to <8 x i16>
+  %vmovl.i4.i = zext <8 x i8> %tmp7 to <8 x i16>
+  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
+  %tmp8 = bitcast <8 x i16> %sub.i to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %tmp8, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  %tmp10 = bitcast i16* %diff to <4 x i16>*
+  store <4 x i16> %tmp9, <4 x i16>* %tmp10, align 8
+  ret void
+}
+
+; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
+; Broadcast loads built via insertelement chains or shufflevector splats
+; should all select the ld1r forms (a <1 x ...> splat is just an ldr).
+; FIX: the "CHECK-NEXT" directives below previously lacked the trailing ':'
+; and were silently ignored by FileCheck; they are now active.
+define <4 x float> @ld1r_4s_float(float* nocapture %x) {
+entry:
+; CHECK: ld1r_4s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load float* %x, align 4
+  %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp2 = insertelement <4 x float> %tmp1, float %tmp, i32 1
+  %tmp3 = insertelement <4 x float> %tmp2, float %tmp, i32 2
+  %tmp4 = insertelement <4 x float> %tmp3, float %tmp, i32 3
+  ret <4 x float> %tmp4
+}
+
+define <2 x float> @ld1r_2s_float(float* nocapture %x) {
+entry:
+; CHECK: ld1r_2s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load float* %x, align 4
+  %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+  %tmp2 = insertelement <2 x float> %tmp1, float %tmp, i32 1
+  ret <2 x float> %tmp2
+}
+
+define <2 x double> @ld1r_2d_double(double* nocapture %x) {
+entry:
+; CHECK: ld1r_2d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load double* %x, align 4
+  %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+  %tmp2 = insertelement <2 x double> %tmp1, double %tmp, i32 1
+  ret <2 x double> %tmp2
+}
+
+define <1 x double> @ld1r_1d_double(double* nocapture %x) {
+entry:
+; CHECK: ld1r_1d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: ret
+  %tmp = load double* %x, align 4
+  %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+  ret <1 x double> %tmp1
+}
+
+define <4 x float> @ld1r_4s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK: ld1r_4s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load float* %x, align 4
+  %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+  %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %lane
+}
+
+define <2 x float> @ld1r_2s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK: ld1r_2s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load float* %x, align 4
+  %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+  %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+  ret <2 x float> %lane
+}
+
+define <2 x double> @ld1r_2d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK: ld1r_2d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+  %tmp = load double* %x, align 4
+  %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+  %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %lane
+}
+
+define <1 x double> @ld1r_1d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK: ld1r_1d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: ret
+  %tmp = load double* %x, align 4
+  %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+  %lane = shufflevector <1 x double> %tmp1, <1 x double> undef, <1 x i32> zeroinitializer
+  ret <1 x double> %lane
+}
+
+; ld1x2 intrinsic tests: each @llvm.arm64.neon.ld1x2.* call must select a
+; two-register ld1 whose arrangement matches the vector element type.
+%struct.__neon_float32x2x2_t = type { <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>,  <2 x float>,  <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>,  <2 x float>, <2 x float>,  <2 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x8x2_t %val
+}
+
+define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x4x2_t %val
+}
+
+define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x1x2_t %val
+}
+
+define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x1x2_t %val
+}
+
+
+; 128-bit (Q-register) variants of the same ld1x2 tests.
+%struct.__neon_float32x4x2_t = type { <4 x float>,  <4 x float> }
+%struct.__neon_float32x4x3_t = type { <4 x float>,  <4 x float>,  <4 x float> }
+%struct.__neon_float32x4x4_t = type { <4 x float>,  <4 x float>, <4 x float>,  <4 x float> }
+
+%struct.__neon_float64x2x2_t = type { <2 x double>,  <2 x double> }
+%struct.__neon_float64x2x3_t = type { <2 x double>,  <2 x double>,  <2 x double> }
+%struct.__neon_float64x2x4_t = type { <2 x double>,  <2 x double>, <2 x double>,  <2 x double> }
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x16x2_t %val
+}
+
+define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x8x2_t %val
+}
+
+define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x4x2_t %val
+}
+
+define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x4x2_t %val
+}
+
+define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x2x2_t %val
+}
+
+define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x2x2_t %val
+}
+
+; ld1x3 intrinsic tests: each @llvm.arm64.neon.ld1x3.* call must select a
+; three-register ld1 whose arrangement matches the vector element type.
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x8x3_t %val
+}
+
+define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x4x3_t %val
+}
+
+define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x1x3_t %val
+}
+
+define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x1x3_t %val
+}
+
+; 128-bit (Q-register) variants of the same ld1x3 tests.
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x16x3_t %val
+}
+
+define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x8x3_t %val
+}
+
+define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x4x3_t %val
+}
+
+define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x4x3_t %val
+}
+
+define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x2x3_t %val
+}
+
+define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x2x3_t %val
+}
+
+; ld1x4 intrinsic tests: each @llvm.arm64.neon.ld1x4.* call must select a
+; four-register ld1 whose arrangement matches the vector element type.
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x8x4_t %val
+}
+
+define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x4x4_t %val
+}
+
+define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x1x4_t %val
+}
+
+define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x1x4_t %val
+}
+
+; 128-bit (Q-register) variants of the same ld1x4 tests.
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %addr)
+  ret %struct.__neon_int8x16x4_t %val
+}
+
+define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %addr)
+  ret %struct.__neon_int16x8x4_t %val
+}
+
+define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %addr)
+  ret %struct.__neon_int32x4x4_t %val
+}
+
+define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x4x4_t %val
+}
+
+define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %addr)
+  ret %struct.__neon_int64x2x4_t %val
+}
+
+define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  %val = call %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %addr)
+  ret %struct.__neon_float64x2x4_t %val
+}

Added: llvm/trunk/test/CodeGen/ARM64/ldp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ldp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ldp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ldp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,149 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN:   -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
+
+; Load-pair formation tests: two adjacent scalar loads should be merged into
+; a single ldp; the LDUR_CHK run line additionally exercises merging of
+; negative-offset (unscaled) loads.
+; CHECK: ldp_int
+; CHECK: ldp
+define i32 @ldp_int(i32* %p) nounwind {
+  %tmp = load i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32* %p, i64 1
+  %tmp1 = load i32* %add.ptr, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  ret i32 %add
+}
+
+; CHECK: ldp_long
+; CHECK: ldp
+define i64 @ldp_long(i64* %p) nounwind {
+  %tmp = load i64* %p, align 8
+  %add.ptr = getelementptr inbounds i64* %p, i64 1
+  %tmp1 = load i64* %add.ptr, align 8
+  %add = add nsw i64 %tmp1, %tmp
+  ret i64 %add
+}
+
+; CHECK: ldp_float
+; CHECK: ldp
+define float @ldp_float(float* %p) nounwind {
+  %tmp = load float* %p, align 4
+  %add.ptr = getelementptr inbounds float* %p, i64 1
+  %tmp1 = load float* %add.ptr, align 4
+  %add = fadd float %tmp, %tmp1
+  ret float %add
+}
+
+; CHECK: ldp_double
+; CHECK: ldp
+define double @ldp_double(double* %p) nounwind {
+  %tmp = load double* %p, align 8
+  %add.ptr = getelementptr inbounds double* %p, i64 1
+  %tmp1 = load double* %add.ptr, align 8
+  %add = fadd double %tmp, %tmp1
+  ret double %add
+}
+
+; Test the load/store optimizer---combine ldurs into a ldp, if appropriate
+define i32 @ldur_int(i32* %a) nounwind {
+; LDUR_CHK: ldur_int
+; LDUR_CHK: ldp     [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add     w{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %a, i32 -1
+  %tmp1 = load i32* %p1, align 2
+  %p2 = getelementptr inbounds i32* %a, i32 -2
+  %tmp2 = load i32* %p2, align 2
+  %tmp3 = add i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+define i64 @ldur_long(i64* %a) nounwind ssp {
+; LDUR_CHK: ldur_long
+; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i64* %a, i64 -1
+  %tmp1 = load i64* %p1, align 2
+  %p2 = getelementptr inbounds i64* %a, i64 -2
+  %tmp2 = load i64* %p2, align 2
+  %tmp3 = add i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define float @ldur_float(float* %a) {
+; LDUR_CHK: ldur_float
+; LDUR_CHK: ldp     [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add     s{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds float* %a, i64 -1
+  %tmp1 = load float* %p1, align 2
+  %p2 = getelementptr inbounds float* %a, i64 -2
+  %tmp2 = load float* %p2, align 2
+  %tmp3 = fadd float %tmp1, %tmp2
+  ret float %tmp3
+}
+
+define double @ldur_double(double* %a) {
+; LDUR_CHK: ldur_double
+; LDUR_CHK: ldp     [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add     d{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds double* %a, i64 -1
+  %tmp1 = load double* %p1, align 2
+  %p2 = getelementptr inbounds double* %a, i64 -2
+  %tmp2 = load double* %p2, align 2
+  %tmp3 = fadd double %tmp1, %tmp2
+  ret double %tmp3
+}
+
+; Now check some boundary conditions
+; ldp's signed 7-bit scaled immediate covers [-256, ...] for 8-byte
+; elements, so offset -256 pairs up but -264 must not.
+define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyIn
+; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i64* %a, i64 -31
+  %tmp1 = load i64* %p1, align 2
+  %p2 = getelementptr inbounds i64* %a, i64 -32
+  %tmp2 = load i64* %p2, align 2
+  %tmp3 = add i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyOut
+; LDUR_CHK-NOT: ldp
+; Don't be fragile about which loads or manipulations of the base register
+; are used---just check that there isn't an ldp before the add
+; LDUR_CHK: add
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i64* %a, i64 -32
+  %tmp1 = load i64* %p1, align 2
+  %p2 = getelementptr inbounds i64* %a, i64 -33
+  %tmp2 = load i64* %p2, align 2
+  %tmp3 = add i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+; Byte-offset (misaligned) accesses must stay as two ldur instructions.
+define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpNotAligned
+; LDUR_CHK-NOT: ldp
+; LDUR_CHK: ldur
+; LDUR_CHK-NEXT: ldur
+; LDUR_CHK-NEXT: add
+; LDUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i64* %a, i64 -18
+  %bp1 = bitcast i64* %p1 to i8*
+  %bp1p1 = getelementptr inbounds i8* %bp1, i64 1
+  %dp1 = bitcast i8* %bp1p1 to i64*
+  %tmp1 = load i64* %dp1, align 1
+
+  %p2 = getelementptr inbounds i64* %a, i64 -17
+  %bp2 = bitcast i64* %p2 to i8*
+  %bp2p1 = getelementptr inbounds i8* %bp2, i64 1
+  %dp2 = bitcast i8* %bp2p1 to i64*
+  %tmp2 = load i64* %dp2, align 1
+
+  %tmp3 = add i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}

Added: llvm/trunk/test/CodeGen/ARM64/ldur.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ldur.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ldur.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ldur.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; Negative-offset loads must select the unscaled forms (ldur/ldurh/ldurb);
+; the zext* functions additionally check that the zero-extension is folded
+; into the load itself (no separate uxt/and).
+define i64 @_f0(i64* %p) {
+; CHECK: f0:
+; CHECK: ldur x0, [x0, #-8]
+; CHECK-NEXT: ret
+  %tmp = getelementptr inbounds i64* %p, i64 -1
+  %ret = load i64* %tmp, align 2
+  ret i64 %ret
+}
+define i32 @_f1(i32* %p) {
+; CHECK: f1:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+  %tmp = getelementptr inbounds i32* %p, i64 -1
+  %ret = load i32* %tmp, align 2
+  ret i32 %ret
+}
+define i16 @_f2(i16* %p) {
+; CHECK: f2:
+; CHECK: ldurh w0, [x0, #-2]
+; CHECK-NEXT: ret
+  %tmp = getelementptr inbounds i16* %p, i64 -1
+  %ret = load i16* %tmp, align 2
+  ret i16 %ret
+}
+define i8 @_f3(i8* %p) {
+; CHECK: f3:
+; CHECK: ldurb w0, [x0, #-1]
+; CHECK-NEXT: ret
+  %tmp = getelementptr inbounds i8* %p, i64 -1
+  %ret = load i8* %tmp, align 2
+  ret i8 %ret
+}
+
+define i64 @zext32(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext32:
+; CHECK: ldur w0, [x0, #-12]
+; CHECK-NEXT: ret
+  %p = getelementptr inbounds i8* %a, i64 -12
+  %tmp1 = bitcast i8* %p to i32*
+  %tmp2 = load i32* %tmp1, align 4
+  %ret = zext i32 %tmp2 to i64
+
+  ret i64 %ret
+}
+define i64 @zext16(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext16:
+; CHECK: ldurh w0, [x0, #-12]
+; CHECK-NEXT: ret
+  %p = getelementptr inbounds i8* %a, i64 -12
+  %tmp1 = bitcast i8* %p to i16*
+  %tmp2 = load i16* %tmp1, align 2
+  %ret = zext i16 %tmp2 to i64
+
+  ret i64 %ret
+}
+define i64 @zext8(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext8:
+; CHECK: ldurb w0, [x0, #-12]
+; CHECK-NEXT: ret
+  %p = getelementptr inbounds i8* %a, i64 -12
+  %tmp2 = load i8* %p, align 1
+  %ret = zext i8 %tmp2 to i64
+
+  ret i64 %ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/ldxr-stxr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ldxr-stxr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ldxr-stxr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/ldxr-stxr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,143 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+%0 = type { i64, i64 }
+
+define i128 @f0(i8* %p) nounwind readonly {
+; CHECK-LABEL: f0:
+; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+  %ldrexd = tail call %0 @llvm.arm64.ldxp(i8* %p)
+  %0 = extractvalue %0 %ldrexd, 1
+  %1 = extractvalue %0 %ldrexd, 0
+  %2 = zext i64 %0 to i128
+  %3 = zext i64 %1 to i128
+  %shl = shl nuw i128 %2, 64
+  %4 = or i128 %shl, %3
+  ret i128 %4
+}
+
+define i32 @f1(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+  %tmp4 = trunc i128 %val to i64
+  %tmp6 = lshr i128 %val, 64
+  %tmp7 = trunc i128 %tmp6 to i64
+  %strexd = tail call i32 @llvm.arm64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+  ret i32 %strexd
+}
+
+declare %0 @llvm.arm64.ldxp(i8*) nounwind
+declare i32 @llvm.arm64.stxp(i64, i64, i8*) nounwind
+
+@var = global i64 0, align 8
+
+define void @test_load_i8(i8* %addr) {
+; CHECK-LABEL: test_load_i8:
+; CHECK: ldxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+  %val = call i64 @llvm.arm64.ldxr.p0i8(i8* %addr)
+  %shortval = trunc i64 %val to i8
+  %extval = zext i8 %shortval to i64
+  store i64 %extval, i64* @var, align 8
+  ret void
+}
+
+define void @test_load_i16(i16* %addr) {
+; CHECK-LABEL: test_load_i16:
+; CHECK: ldxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+  %val = call i64 @llvm.arm64.ldxr.p0i16(i16* %addr)
+  %shortval = trunc i64 %val to i16
+  %extval = zext i16 %shortval to i64
+  store i64 %extval, i64* @var, align 8
+  ret void
+}
+
+define void @test_load_i32(i32* %addr) {
+; CHECK-LABEL: test_load_i32:
+; CHECK: ldxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+  %val = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)
+  %shortval = trunc i64 %val to i32
+  %extval = zext i32 %shortval to i64
+  store i64 %extval, i64* @var, align 8
+  ret void
+}
+
+define void @test_load_i64(i64* %addr) {
+; CHECK-LABEL: test_load_i64:
+; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+  %val = call i64 @llvm.arm64.ldxr.p0i64(i64* %addr)
+  store i64 %val, i64* @var, align 8
+  ret void
+}
+
+
+declare i64 @llvm.arm64.ldxr.p0i8(i8*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i16(i16*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i32(i32*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i64(i64*) nounwind
+
+define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stxrb w0, w1, [x2]
+  %extval = zext i8 %val to i64
+  %res = call i32 @llvm.arm64.stxr.p0i8(i64 %extval, i8* %addr)
+  ret i32 %res
+}
+
+define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stxrh w0, w1, [x2]
+  %extval = zext i16 %val to i64
+  %res = call i32 @llvm.arm64.stxr.p0i16(i64 %extval, i16* %addr)
+  ret i32 %res
+}
+
+define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stxr w0, w1, [x2]
+  %extval = zext i32 %val to i64
+  %res = call i32 @llvm.arm64.stxr.p0i32(i64 %extval, i32* %addr)
+  ret i32 %res
+}
+
+define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_i64:
+; CHECK: stxr w0, x1, [x2]
+  %res = call i32 @llvm.arm64.stxr.p0i64(i64 %val, i64* %addr)
+  ret i32 %res
+}
+
+declare i32 @llvm.arm64.stxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.arm64.stxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.arm64.stxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.arm64.stxr.p0i64(i64, i64*) nounwind
+
+; CHECK: test_clear:
+; CHECK: clrex
+define void @test_clear() {
+  call void @llvm.arm64.clrex()
+  ret void
+}
+
+declare void @llvm.arm64.clrex() nounwind
+

Added: llvm/trunk/test/CodeGen/ARM64/leaf-compact-unwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/leaf-compact-unwind.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/leaf-compact-unwind.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/leaf-compact-unwind.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,161 @@
+; Use the -disable-cfi flag so that we get the compact unwind info in the
+; emitted assembly. Compact unwind info is omitted when CFI directives
+; are emitted.
+;
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -disable-cfi < %s | FileCheck %s
+;
+; rdar://13070556
+
+@bar = common global i32 0, align 4
+
+; Leaf function with no stack allocation and no saving/restoring
+; of non-volatile registers.
+define i32 @foo1(i32 %a) #0 {
+entry:
+  %add = add nsw i32 %a, 42
+  ret i32 %add
+}
+
+; Leaf function with stack allocation but no saving/restoring
+; of non-volatile registers.
+define i32 @foo2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
+entry:
+  %stack = alloca [36 x i32], align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.body ]
+  %arrayidx = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv19
+  %0 = trunc i64 %indvars.iv19 to i32
+  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next20 = add i64 %indvars.iv19, 1
+  %lftr.wideiv21 = trunc i64 %indvars.iv.next20 to i32
+  %exitcond22 = icmp eq i32 %lftr.wideiv21, 36
+  br i1 %exitcond22, label %for.body4, label %for.body
+
+for.body4:                                        ; preds = %for.body, %for.body4
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body ]
+  %z1.016 = phi i32 [ %add, %for.body4 ], [ 0, %for.body ]
+  %arrayidx6 = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv
+  %1 = load i32* %arrayidx6, align 4, !tbaa !0
+  %add = add nsw i32 %1, %z1.016
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 36
+  br i1 %exitcond, label %for.end9, label %for.body4
+
+for.end9:                                         ; preds = %for.body4
+  ret i32 %add
+}
+
+; Leaf function with no stack allocation but with saving restoring of
+; non-volatile registers.
+define i32 @foo3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #1 {
+entry:
+  %0 = load volatile i32* @bar, align 4, !tbaa !0
+  %1 = load volatile i32* @bar, align 4, !tbaa !0
+  %2 = load volatile i32* @bar, align 4, !tbaa !0
+  %3 = load volatile i32* @bar, align 4, !tbaa !0
+  %4 = load volatile i32* @bar, align 4, !tbaa !0
+  %5 = load volatile i32* @bar, align 4, !tbaa !0
+  %6 = load volatile i32* @bar, align 4, !tbaa !0
+  %7 = load volatile i32* @bar, align 4, !tbaa !0
+  %8 = load volatile i32* @bar, align 4, !tbaa !0
+  %9 = load volatile i32* @bar, align 4, !tbaa !0
+  %10 = load volatile i32* @bar, align 4, !tbaa !0
+  %11 = load volatile i32* @bar, align 4, !tbaa !0
+  %12 = load volatile i32* @bar, align 4, !tbaa !0
+  %13 = load volatile i32* @bar, align 4, !tbaa !0
+  %14 = load volatile i32* @bar, align 4, !tbaa !0
+  %15 = load volatile i32* @bar, align 4, !tbaa !0
+  %16 = load volatile i32* @bar, align 4, !tbaa !0
+  %17 = load volatile i32* @bar, align 4, !tbaa !0
+  %factor = mul i32 %h, -2
+  %factor56 = mul i32 %g, -2
+  %factor57 = mul i32 %f, -2
+  %factor58 = mul i32 %e, -2
+  %factor59 = mul i32 %d, -2
+  %factor60 = mul i32 %c, -2
+  %factor61 = mul i32 %b, -2
+  %sum = add i32 %1, %0
+  %sum62 = add i32 %sum, %2
+  %sum63 = add i32 %sum62, %3
+  %sum64 = add i32 %sum63, %4
+  %sum65 = add i32 %sum64, %5
+  %sum66 = add i32 %sum65, %6
+  %sum67 = add i32 %sum66, %7
+  %sum68 = add i32 %sum67, %8
+  %sum69 = add i32 %sum68, %9
+  %sum70 = add i32 %sum69, %10
+  %sum71 = add i32 %sum70, %11
+  %sum72 = add i32 %sum71, %12
+  %sum73 = add i32 %sum72, %13
+  %sum74 = add i32 %sum73, %14
+  %sum75 = add i32 %sum74, %15
+  %sum76 = add i32 %sum75, %16
+  %sub10 = sub i32 %17, %sum76
+  %sub11 = add i32 %sub10, %factor
+  %sub12 = add i32 %sub11, %factor56
+  %sub13 = add i32 %sub12, %factor57
+  %sub14 = add i32 %sub13, %factor58
+  %sub15 = add i32 %sub14, %factor59
+  %sub16 = add i32 %sub15, %factor60
+  %add = add i32 %sub16, %factor61
+  ret i32 %add
+}
+
+; Leaf function with stack allocation and saving/restoring of non-volatile
+; registers.
+define i32 @foo4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
+entry:
+  %stack = alloca [128 x i32], align 4
+  %0 = zext i32 %a to i64
+  br label %for.body
+
+for.cond2.preheader:                              ; preds = %for.body
+  %1 = sext i32 %f to i64
+  br label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %for.body ]
+  %2 = add nsw i64 %indvars.iv22, %0
+  %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %indvars.iv22
+  %3 = trunc i64 %2 to i32
+  store i32 %3, i32* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next23 = add i64 %indvars.iv22, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next23 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, 128
+  br i1 %exitcond26, label %for.cond2.preheader, label %for.body
+
+for.body4:                                        ; preds = %for.body4, %for.cond2.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond2.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %z1.018 = phi i32 [ 0, %for.cond2.preheader ], [ %add8, %for.body4 ]
+  %4 = add nsw i64 %indvars.iv, %1
+  %arrayidx7 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %4
+  %5 = load i32* %arrayidx7, align 4, !tbaa !0
+  %add8 = add nsw i32 %5, %z1.018
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.end11, label %for.body4
+
+for.end11:                                        ; preds = %for.body4
+  ret i32 %add8
+}
+
+attributes #0 = { readnone "target-cpu"="cyclone" }
+attributes #1 = { "target-cpu"="cyclone" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+
+; CHECK:        .section        __LD,__compact_unwind,regular,debug
+; CHECK:        .quad   _foo1                   ; Range Start
+; CHECK:        .long   33554432                ; Compact Unwind Encoding: 0x2000000
+; CHECK:        .quad   _foo2                   ; Range Start
+; CHECK:        .long   33591296                ; Compact Unwind Encoding: 0x2009000
+; CHECK:        .quad   _foo3                   ; Range Start
+; CHECK:        .long   33570831                ; Compact Unwind Encoding: 0x200400f
+; CHECK:        .quad   _foo4                   ; Range Start
+; CHECK:        .long   33689616                ; Compact Unwind Encoding: 0x2021010

Added: llvm/trunk/test/CodeGen/ARM64/leaf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/leaf.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/leaf.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/leaf.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,13 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://12829704
+
+define void @t8() nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK-NOT: stp	fp, lr, [sp, #-16]!
+; CHECK-NOT: mov	fp, sp
+; CHECK: nop
+; CHECK-NOT: mov	sp, fp
+; CHECK-NOT: ldp	fp, lr, [sp], #16
+  tail call void asm sideeffect "nop", "~{v8}"() nounwind
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/lit.local.cfg?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/lit.local.cfg (added)
+++ llvm/trunk/test/CodeGen/ARM64/lit.local.cfg Sat Mar 29 05:18:08 2014
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/CodeGen/ARM64/long-shift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/long-shift.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/long-shift.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/long-shift.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+define i128 @shl(i128 %r, i128 %s) nounwind readnone {
+; CHECK-LABEL: shl:
+; CHECK: lslv  [[XREG_0:x[0-9]+]], x1, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lsrv  [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
+; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
+; CHECK-NEXT: lslv  [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
+; CHECK-NEXT: cmp   [[XREG_4]], #0
+; CHECK-NEXT: csel  x1, [[XREG_5]], [[XREG_6]], ge
+; CHECK-NEXT: lslv  [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK-NEXT: csel  x0, xzr, [[SMALLSHIFT_LO]], ge
+; CHECK-NEXT: ret
+
+  %shl = shl i128 %r, %s
+  ret i128 %shl
+}
+
+define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
+; CHECK: ashr:
+; CHECK: lsrv  [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lslv  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: asrv  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp   [[XREG_5]], #0
+; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: asrv  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
+; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+  %shr = ashr i128 %r, %s
+  ret i128 %shr
+}
+
+define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
+; CHECK: lshr:
+; CHECK: lsrv  [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lslv  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: lsrv  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp   [[XREG_5]], #0
+; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: lsrv  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+  %shr = lshr i128 %r, %s
+  ret i128 %shr
+}

Added: llvm/trunk/test/CodeGen/ARM64/memcpy-inline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/memcpy-inline.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/memcpy-inline.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/memcpy-inline.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,112 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+
+@src = external global %struct.x
+@dst = external global %struct.x
+
+@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
+@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
+@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
+@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR  \00", align 1
+@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
+@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
+@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
+
+define i32 @t0() {
+entry:
+; CHECK-LABEL: t0:
+; CHECK: ldrb [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #10]
+; CHECK: strb [[REG0]], [x[[BASEREG2:[0-9]+]], #10]
+; CHECK: ldrh [[REG1:w[0-9]+]], [x[[BASEREG]], #8]
+; CHECK: strh [[REG1]], [x[[BASEREG2]], #8]
+; CHECK: ldr [[REG2:x[0-9]+]],
+; CHECK: str [[REG2]],
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
+  ret i32 0
+}
+
+define void @t1(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
+; CHECK: stur [[DEST]], [x0, #15]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
+  ret void
+}
+
+define void @t2(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: movz [[REG3:w[0-9]+]]
+; CHECK: movk [[REG3]],
+; CHECK: str [[REG3]], [x0, #32]
+; CHECK: ldp [[DEST1:q[0-9]+]], [[DEST2:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: stp [[DEST1]], [[DEST2]], [x0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
+  ret void
+}
+
+define void @t3(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
+; CHECK: str [[REG4]], [x0, #16]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
+  ret void
+}
+
+define void @t4(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x20
+; CHECK: strh [[REG5]], [x0, #16]
+; CHECK: ldr [[REG6:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: str [[REG6]], [x0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
+  ret void
+}
+
+define void @t5(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: strb wzr, [x0, #6]
+; CHECK: movz [[REG7:w[0-9]+]], #21587
+; CHECK: strh [[REG7]], [x0, #4]
+; CHECK: movz [[REG8:w[0-9]+]],
+; CHECK: movk [[REG8]],
+; CHECK: str [[REG8]], [x0]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
+  ret void
+}
+
+define void @t6() nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6]
+; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6]
+; CHECK: ldr
+; CHECK: str
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
+  ret void
+}
+
+%struct.Foo = type { i32, i32, i32, i32 }
+
+define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
+entry:
+; CHECK: t7
+; CHECK: ldr [[REG10:q[0-9]+]], [x1]
+; CHECK: str [[REG10]], [x0]
+  %0 = bitcast %struct.Foo* %a to i8*
+  %1 = bitcast %struct.Foo* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/memset-inline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/memset-inline.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/memset-inline.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/memset-inline.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @t1(i8* nocapture %c) nounwind optsize {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: str wzr, [x0, #8]
+; CHECK: str xzr, [x0]
+  call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
+  ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: strh wzr, [sp, #32]
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: str xzr, [sp, #8]
+  %buf = alloca [26 x i8], align 1
+  %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0
+  call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
+  call void @something(i8* %0) nounwind
+  ret void
+}
+
+declare void @something(i8*) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/memset-to-bzero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/memset-to-bzero.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/memset-to-bzero.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/memset-to-bzero.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,101 @@
+; RUN: llc %s -march arm64 -o - | FileCheck %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
+
+; CHECK: @fct1
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK: memset
+define void @fct1(i8* nocapture %ptr) {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+; CHECK: @fct2
+; When the size is bigger than 256, change into bzero.
+; CHECK: bzero
+define void @fct2(i8* nocapture %ptr) {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
+  ret void
+}
+
+; CHECK: @fct3
+; For unknown size, change to bzero.
+; CHECK: bzero
+define void @fct3(i8* nocapture %ptr, i32 %unknown) {
+entry:
+  %conv = sext i32 %unknown to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i32 1, i1 false)
+  ret void
+}
+
+; CHECK: @fct4
+; Size <= 256, no change.
+; CHECK: memset
+define void @fct4(i8* %ptr) {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK: @fct5
+; Size > 256, change.
+; CHECK: bzero
+define void @fct5(i8* %ptr) {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK: @fct6
+; Size = unknown, change.
+; CHECK: bzero
+define void @fct6(i8* %ptr, i32 %unknown) {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+  ret void
+}
+
+; Next functions check that memset is not turned into bzero
+; when the set constant is non-zero, whatever the given size.
+
+; CHECK: @fct7
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct7(i8* %ptr) {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  ret void
+}
+
+; CHECK: @fct8
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct8(i8* %ptr) {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK: @fct9
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct9(i8* %ptr, i32 %unknown) {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/movi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/movi.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/movi.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/movi.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,202 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+;==--------------------------------------------------------------------------==
+; Tests for MOV-immediate implemented with ORR-immediate.
+;==--------------------------------------------------------------------------==
+
+; 64-bit immed with 32-bit pattern size, rotated by 0.
+define i64 @test64_32_rot0() nounwind {
+; CHECK: test64_32_rot0
+; CHECK: orr x0, xzr, #0x700000007
+  ret i64 30064771079
+}
+
+; 64-bit immed with 32-bit pattern size, rotated by 2.
+define i64 @test64_32_rot2() nounwind {
+; CHECK: test64_32_rot2
+; CHECK: orr x0, xzr, #0xc0000003c0000003
+  ret i64 13835058071388291075
+}
+
+; 64-bit immed with 4-bit pattern size, rotated by 3.
+define i64 @test64_4_rot3() nounwind {
+; CHECK: test64_4_rot3
+; CHECK: orr  x0, xzr, #0xeeeeeeeeeeeeeeee
+  ret i64 17216961135462248174
+}
+
+; 32-bit immed with 32-bit pattern size, rotated by 16.
+define i32 @test32_32_rot16() nounwind {
+; CHECK: test32_32_rot16
+; CHECK: orr w0, wzr, #0xff0000
+  ret i32 16711680
+}
+
+; 32-bit immed with 2-bit pattern size, rotated by 1.
+define i32 @test32_2_rot1() nounwind {
+; CHECK: test32_2_rot1
+; CHECK: orr w0, wzr, #0xaaaaaaaa
+  ret i32 2863311530
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVZ with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i32 @movz() nounwind {
+; CHECK: movz
+; CHECK: movz w0, #5
+  ret i32 5
+}
+
+define i64 @movz_3movk() nounwind {
+; CHECK: movz_3movk
+; CHECK:      movz x0, #5, lsl #48
+; CHECK-NEXT: movk x0, #4660, lsl #32
+; CHECK-NEXT: movk x0, #43981, lsl #16
+; CHECK-NEXT: movk x0, #22136
+  ret i64 1427392313513592
+}
+
+define i64 @movz_movk_skip1() nounwind {
+; CHECK: movz_movk_skip1
+; CHECK:      movz x0, #5, lsl #32
+; CHECK-NEXT: movk x0, #17185, lsl #16
+  ret i64 22601072640
+}
+
+define i64 @movz_skip1_movk() nounwind {
+; CHECK: movz_skip1_movk
+; CHECK:      movz x0, #34388, lsl #32
+; CHECK-NEXT: movk x0, #4660
+  ret i64 147695335379508
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVN with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i64 @movn() nounwind {
+; CHECK: movn
+; CHECK: movn x0, #41
+  ret i64 -42
+}
+
+define i64 @movn_skip1_movk() nounwind {
+; CHECK: movn_skip1_movk
+; CHECK:      movn x0, #41, lsl #32
+; CHECK-NEXT: movk x0, #4660
+  ret i64 -176093720012
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for ORR with MOVK.
+;==--------------------------------------------------------------------------==
+; rdar://14987673
+
+define i64 @orr_movk1() nounwind {
+; CHECK: orr_movk1
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #16
+  ret i64 72056498262245120
+}
+
+define i64 @orr_movk2() nounwind {
+; CHECK: orr_movk2
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #48
+  ret i64 -2400982650836746496
+}
+
+define i64 @orr_movk3() nounwind {
+; CHECK: orr_movk3
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #32
+  ret i64 72020953688702720
+}
+
+define i64 @orr_movk4() nounwind {
+; CHECK: orr_movk4
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005
+  ret i64 72056494543068845
+}
+
+; rdar://14987618
+define i64 @orr_movk5() nounwind {
+; CHECK: orr_movk5
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #16
+  ret i64 -71777214836900096
+}
+
+define i64 @orr_movk6() nounwind {
+; CHECK: orr_movk6
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #57005, lsl #48
+  ret i64 -2400982647117578496
+}
+
+define i64 @orr_movk7() nounwind {
+; CHECK: orr_movk7
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #48
+  ret i64 -2400982646575268096
+}
+
+define i64 @orr_movk8() nounwind {
+; CHECK: orr_movk8
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
+  ret i64 -2400982646575276371
+}
+
+; rdar://14987715
+define i64 @orr_movk9() nounwind {
+; CHECK: orr_movk9
+; CHECK: orr x0, xzr, #0xffffff000000000
+; CHECK: movk x0, #65280
+; CHECK: movk x0, #57005, lsl #16
+  ret i64 1152921439623315200
+}
+
+define i64 @orr_movk10() nounwind {
+; CHECK: orr_movk10
+; CHECK: orr x0, xzr, #0xfffffffffffff00
+; CHECK: movk x0, #57005, lsl #16
+  ret i64 1152921504047824640
+}
+
+define i64 @orr_movk11() nounwind {
+; CHECK: orr_movk11
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #65535, lsl #32
+  ret i64 -4222125209747201
+}
+
+define i64 @orr_movk12() nounwind {
+; CHECK: orr_movk12
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #57005, lsl #32
+  ret i64 -4258765016661761
+}
+
+define i64 @orr_movk13() nounwind {
+; CHECK: orr_movk13
+; CHECK: orr x0, xzr, #0xfffff000000
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
+  ret i64 -2401245434149282131
+}
+
+; rdar://13944082
+define i64 @g() nounwind {
+; CHECK: g
+; CHECK: movz x0, #65535, lsl #48
+; CHECK: movk x0, #2
+entry:
+  ret i64 -281474976710654
+}

Added: llvm/trunk/test/CodeGen/ARM64/mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/mul.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/mul.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/mul.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; rdar://9296808
+; rdar://9349137
+
+define i128 @t1(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: umulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+  %tmp1 = zext i64 %a to i128
+  %tmp2 = zext i64 %b to i128
+  %tmp3 = mul i128 %tmp1, %tmp2
+  ret i128 %tmp3
+}
+
+define i128 @t2(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: smulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+  %tmp1 = sext i64 %a to i128
+  %tmp2 = sext i64 %b to i128
+  %tmp3 = mul i128 %tmp1, %tmp2
+  ret i128 %tmp3
+}
+
+define i64 @t3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+  %tmp1 = zext i32 %a to i64
+  %tmp2 = zext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define i64 @t4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: smull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+  %tmp1 = sext i32 %a to i64
+  %tmp2 = sext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  ret i64 %tmp3
+}
+
+define i64 @t5(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+  %tmp1 = zext i32 %a to i64
+  %tmp2 = zext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  %tmp4 = add i64 %c, %tmp3
+  ret i64 %tmp4
+}
+
+define i64 @t6(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+  %tmp1 = sext i32 %a to i64
+  %tmp2 = sext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  %tmp4 = sub i64 %c, %tmp3
+  ret i64 %tmp4
+}
+
+define i64 @t7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+  %tmp1 = zext i32 %a to i64
+  %tmp2 = zext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  %tmp4 = sub i64 0, %tmp3
+  ret i64 %tmp4
+}
+
+define i64 @t8(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+  %tmp1 = sext i32 %a to i64
+  %tmp2 = sext i32 %b to i64
+  %tmp3 = mul i64 %tmp1, %tmp2
+  %tmp4 = sub i64 0, %tmp3
+  ret i64 %tmp4
+}

Added: llvm/trunk/test/CodeGen/ARM64/neon-compare-instructions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/neon-compare-instructions.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/neon-compare-instructions.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/neon-compare-instructions.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,1191 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s
+
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp eq <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp eq <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp eq <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp eq <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp eq <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp eq <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp eq <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp sgt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp sgt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp sgt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp sgt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp sgt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp sgt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp sgt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp slt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp slt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp slt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp slt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp slt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp slt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp slt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp sge <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp sge <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp sge <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp sge <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp sge <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp sge <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp sge <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp sle <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp sle <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp sle <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp sle <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp sle <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp sle <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp sle <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ugt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ugt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp ugt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp ugt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp ugt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp ugt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp ugt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ult <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ult <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ult <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ult <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ult <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ult <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ult <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp uge <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp uge <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp uge <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp uge <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp uge <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp uge <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp uge <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ule <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ule <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ule <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ule <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ule <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ule <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ule <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+	%tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+	%tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+	%tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+	%tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+	%tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+	%tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+	%tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+	%tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+	%tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+	%tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+	%tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+	%tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+	%tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+	%tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+	%tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+	%tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+	%tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+	%tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+	%tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+	%tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+	%tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+	%tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+	%tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+	%tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+	%tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+	%tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+	%tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
+;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+	%tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+	%tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+	%tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+	%tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+	%tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+	%tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+	%tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+	%tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+	%tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+	%tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+	%tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+	%tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+	%tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+	%tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+	%tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+	%tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+	%tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+	%tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+	%tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+	%tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+	%tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+	%tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
+	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+	%tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+	%tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+	%tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+	%tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v[[ZERO]].8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+	%tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+	%tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+	%tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+	%tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+	%tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+	%tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <1 x i64> @cmeqz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmeqz_v1i64:
+; CHECK: cmeq d0, d0, #0
+  %tst = icmp eq <1 x i64> %A, <i64 0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgez_v1i64:
+; CHECK: cmge d0, d0, #0
+  %tst = icmp sge <1 x i64> %A, <i64 0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgtz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgtz_v1i64:
+; CHECK: cmgt d0, d0, #0
+  %tst = icmp sgt <1 x i64> %A, <i64 0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmlez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmlez_v1i64:
+; CHECK: cmle d0, d0, #0
+  %tst = icmp sle <1 x i64> %A, <i64 0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmltz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmltz_v1i64:
+; CHECK: cmlt d0, d0, #0
+  %tst = icmp slt <1 x i64> %A, <i64 0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmeqz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmeqz_v1f64:
+; CHECK: fcmeq d0, d0, #0
+  %tst = fcmp oeq <1 x double> %A, <double 0.0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgez_v1f64:
+; CHECK: fcmge d0, d0, #0
+  %tst = fcmp oge <1 x double> %A, <double 0.0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgtz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgtz_v1f64:
+; CHECK: fcmgt d0, d0, #0
+  %tst = fcmp ogt <1 x double> %A, <double 0.0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmlez_v1f64:
+; CHECK: fcmle d0, d0, #0
+  %tst = fcmp ole <1 x double> %A, <double 0.0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmltz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmltz_v1f64:
+; CHECK: fcmlt d0, d0, #0
+  %tst = fcmp olt <1 x double> %A, <double 0.0>
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}

Added: llvm/trunk/test/CodeGen/ARM64/patchpoint.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/patchpoint.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/patchpoint.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/patchpoint.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,163 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 | FileCheck %s
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+; CHECK:       movz x16, #57005, lsl #32
+; CHECK-NEXT:  movk x16, #48879, lsl #16
+; CHECK-NEXT:  movk x16, #51966
+; CHECK-NEXT:  blr  x16
+; CHECK:       movz x16, #57005, lsl #32
+; CHECK-NEXT:  movk x16, #48879, lsl #16
+; CHECK-NEXT:  movk x16, #51967
+; CHECK-NEXT:  blr  x16
+; CHECK:       ret
+  %resolveCall2 = inttoptr i64 244837814094590 to i8*
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %resolveCall3 = inttoptr i64 244837814094591 to i8*
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  ret i64 %result
+}
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK:       mov fp, sp
+; CHECK-NEXT:  sub sp, sp, #32
+; CHECK:       Ltmp
+; CHECK:       mov sp, fp
+; CHECK:       ret
+
+define void @caller_meta_leaf() {
+entry:
+  %metadata = alloca i64, i32 3, align 8
+  store i64 11, i64* %metadata
+  store i64 12, i64* %metadata
+  store i64 13, i64* %metadata
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  ret void
+}
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in register, the other will be pushed on the stack.
+; Return value in x0.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      str x{{.+}}, [sp]
+; CHECK-NEXT: mov  x0, x{{.+}}
+; CHECK:      Ltmp
+; CHECK-NEXT: movz  x16, #65535, lsl #32
+; CHECK-NEXT: movk  x16, #57005, lsl #16
+; CHECK-NEXT: movk  x16, #48879
+; CHECK-NEXT: blr x16
+  %resolveCall2 = inttoptr i64 281474417671919 to i8*
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %resolveCall3 = inttoptr i64 244837814038255 to i8*
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  ret void
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK:      Ltmp
+; CHECK:      orr x{{.+}}, xzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK:      Ltmp
+; CHECK-NEXT: movz  x16, #65535, lsl #32
+; CHECK-NEXT: movk  x16, #57005, lsl #16
+; CHECK-NEXT: movk  x16, #48879
+; CHECK-NEXT: blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK:      Ltmp
+; CHECK:      movz  x{{.+}}, #10
+; CHECK-NEXT: str x{{.+}}, [sp, #48]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
+; CHECK-NEXT: str w{{.+}}, [sp, #36]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK:      Ltmp
+; CHECK-NEXT: movz  x16, #65535, lsl #32
+; CHECK-NEXT: movk  x16, #57005, lsl #16
+; CHECK-NEXT: movk  x16, #48879
+; CHECK-NEXT: blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  ret i64 %result
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar://15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+  %tmp80 = add i64 %tmp79, -16
+  %tmp81 = inttoptr i64 %tmp80 to i64*
+  %tmp82 = load i64* %tmp81, align 8
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  %tmp83 = load i64* %tmp33, align 8
+  %tmp84 = add i64 %tmp83, -24
+  %tmp85 = inttoptr i64 %tmp84 to i64*
+  %tmp86 = load i64* %tmp85, align 8
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ldp
+; CHECK-NEXT: ret
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+  ret void
+}
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
+define void @clobberScratch(i32* %p) {
+  %v = load i32* %p
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+  store i32 %v, i32* %p
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

Added: llvm/trunk/test/CodeGen/ARM64/platform-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/platform-reg.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/platform-reg.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/platform-reg.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+; x18 is reserved as a platform register on Darwin but not on other
+; systems. Create loads of register pressure and make sure this is respected.
+
+; Also, fp must always refer to a valid frame record, even if it's not the one
+; of the current function, so it shouldn't be used either.
+
+ at var = global [30 x i64] zeroinitializer
+
+define void @keep_live() {
+  %val = load volatile [30 x i64]* @var
+  store volatile [30 x i64] %val, [30 x i64]* @var
+
+; CHECK: ldr x18
+; CHECK: str x18
+
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: Spill
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: ret
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/popcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/popcnt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/popcnt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/popcnt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
+  %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %cnt
+; CHECK: fmov	s0, w0
+; CHECK: cnt.8b	v0, v0
+; CHECK: uaddlv.8b	h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+}
+
+define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
+  %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+  ret i64 %cnt
+; CHECK: fmov	d0, x0
+; CHECK: cnt.8b	v0, v0
+; CHECK: uaddlv.8b	h0, v0
+; CHECK: fmov	w0, s0
+; CHECK: ret
+}
+
+; Do not use AdvSIMD when -mno-implicit-float is specified.
+; rdar://9473858
+
+define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
+  %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %cnt
+; CHECK-LABEL: cnt32:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
+  %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+  ret i64 %cnt
+; CHECK-LABEL: cnt64:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/prefetch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/prefetch.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/prefetch.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/prefetch.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,88 @@
+; RUN: llc %s -march arm64 -o - | FileCheck %s
+
+ at a = common global i32* null, align 8
+
+define void @test(i32 %i, i32 %j) nounwind ssp {
+entry:
+  ; CHECK: @test
+  %j.addr = alloca i32, align 4
+  store i32 %j, i32* %j.addr, align 4, !tbaa !0
+  %tmp = bitcast i32* %j.addr to i8*
+  ; CHECK: prfum pldl1strm
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 1)
+  ; CHECK: prfum pldl3keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 1)
+  ; CHECK: prfum pldl2keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 1)
+  ; CHECK: prfum pldl1keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
+
+  ; CHECK: prfum pstl1strm
+  call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
+  ; CHECK: prfum pstl3keep
+  call void @llvm.prefetch(i8* %tmp, i32 1, i32 1, i32 1)
+  ; CHECK: prfum pstl2keep
+  call void @llvm.prefetch(i8* %tmp, i32 1, i32 2, i32 1)
+  ; CHECK: prfum pstl1keep
+  call void @llvm.prefetch(i8* %tmp, i32 1, i32 3, i32 1)
+
+  %tmp1 = load i32* %j.addr, align 4, !tbaa !0
+  %add = add nsw i32 %tmp1, %i
+  %idxprom = sext i32 %add to i64
+  %tmp2 = load i32** @a, align 8, !tbaa !3
+  %arrayidx = getelementptr inbounds i32* %tmp2, i64 %idxprom
+  %tmp3 = bitcast i32* %arrayidx to i8*
+
+  ; CHECK: prfm pldl1strm
+  call void @llvm.prefetch(i8* %tmp3, i32 0, i32 0, i32 1)
+  %tmp4 = load i32** @a, align 8, !tbaa !3
+  %arrayidx3 = getelementptr inbounds i32* %tmp4, i64 %idxprom
+  %tmp5 = bitcast i32* %arrayidx3 to i8*
+
+  ; CHECK: prfm pldl3keep
+  call void @llvm.prefetch(i8* %tmp5, i32 0, i32 1, i32 1)
+  %tmp6 = load i32** @a, align 8, !tbaa !3
+  %arrayidx6 = getelementptr inbounds i32* %tmp6, i64 %idxprom
+  %tmp7 = bitcast i32* %arrayidx6 to i8*
+
+  ; CHECK: prfm pldl2keep
+  call void @llvm.prefetch(i8* %tmp7, i32 0, i32 2, i32 1)
+  %tmp8 = load i32** @a, align 8, !tbaa !3
+  %arrayidx9 = getelementptr inbounds i32* %tmp8, i64 %idxprom
+  %tmp9 = bitcast i32* %arrayidx9 to i8*
+
+  ; CHECK: prfm pldl1keep
+  call void @llvm.prefetch(i8* %tmp9, i32 0, i32 3, i32 1)
+  %tmp10 = load i32** @a, align 8, !tbaa !3
+  %arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
+  %tmp11 = bitcast i32* %arrayidx12 to i8*
+
+  ; CHECK: prfm pstl1strm
+  call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+  %tmp12 = load i32** @a, align 8, !tbaa !3
+  %arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
+  %tmp13 = bitcast i32* %arrayidx15 to i8*
+
+  ; CHECK: prfm pstl3keep
+  call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+  %tmp14 = load i32** @a, align 8, !tbaa !3
+  %arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
+  %tmp15 = bitcast i32* %arrayidx18 to i8*
+
+  ; CHECK: prfm pstl2keep
+  call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+  %tmp16 = load i32** @a, align 8, !tbaa !3
+  %arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
+  %tmp17 = bitcast i32* %arrayidx21 to i8*
+
+  ; CHECK: prfm pstl1keep
+  call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+  ret void
+}
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}

Added: llvm/trunk/test/CodeGen/ARM64/promote-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/promote-const.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/promote-const.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/promote-const.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,255 @@
+; Disable machine cse to stress the different path of the algorithm.
+; Otherwise, we always fall in the simple case, i.e., only one definition.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-stress-promote-const | FileCheck -check-prefix=PROMOTED %s
+; The REGULAR run just checks that the inputs passed to promote const expose
+; the appropriate patterns.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-promote-const=false | FileCheck -check-prefix=REGULAR %s
+
+%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
+
+; Constant is a structure
+define %struct.uint8x16x4_t @test1() {
+; PROMOTED-LABEL: test1:
+; Promote constant has created a big constant for the whole structure
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], __PromotedConst at PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], __PromotedConst at PAGEOFF
+; Destination registers are defined by the ABI
+; PROMOTED-NEXT: ldp q0, q1, {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: ldp q2, q3, {{\[}}[[BASEADDR]], #32]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test1:
+; Regular access is quite bad, it performs 4 loads, one for each chunk of
+; the structure
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; Destination registers are defined by the ABI
+; REGULAR: ldr q0, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q1, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR2:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR: ldr q2, {{\[}}[[PAGEADDR2]], [[CSTLABEL2]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR3:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR: ldr q3, {{\[}}[[PAGEADDR3]], [[CSTLABEL3]]@PAGEOFF]
+; REGULAR-NEXT: ret
+entry:
+  ret %struct.uint8x16x4_t { [4 x <16 x i8>] [<16 x i8> <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, <16 x i8> <i8 32, i8 124, i8 121, i8 120, i8 8, i8 117, i8 -56, i8 113, i8 -76, i8 110, i8 -53, i8 107, i8 7, i8 105, i8 103, i8 102>, <16 x i8> <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>, <16 x i8> <i8 -104, i8 83, i8 -20, i8 81, i8 81, i8 80, i8 -59, i8 78, i8 73, i8 77, i8 -37, i8 75, i8 122, i8 74, i8 37, i8 73>] }
+}
+
+; Two different uses of the same constant in the same basic block
+define <16 x i8> @test2(<16 x i8> %arg) {
+entry:
+; PROMOTED-LABEL: test2:
+; In stress mode, constant vector are promoted
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test2:
+; Regular access is strictly the same as promoted access.
+; The difference is that the address (and thus the space in memory) is not
+; shared between constants
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: ret
+  %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  %add.i9 = add <16 x i8> %add.i, %mul.i
+  ret <16 x i8> %add.i9
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; one dominates the other
+define <16 x i8> @test3(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test3:
+; In stress mode, constant vector are promoted
+; Since, the constant is the same as the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbnz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV2:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV2]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM]], {{\[}}[[BASEADDR]]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: add.16b v0, v0, [[DESTV]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test3:
+; Regular mode does not eliminate common subexpressions on its own.
+; In other words, the same loads appears several times.
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; Redundant load
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL2]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: add.16b v0, v0, [[DESTV]]
+; REGULAR-NEXT: ret
+entry:
+  %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  %tobool = icmp eq i32 %path, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %mul.i13 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %mul.i = mul <16 x i8> %add.i, <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %ret2.0 = phi <16 x i8> [ %mul.i13, %if.then ], [ %mul.i, %if.else ]
+  %add.i12 = add <16 x i8> %add.i, %ret2.0
+  ret <16 x i8> %add.i12
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; none dominates the other
+define <16 x i8> @test4(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test4:
+; In stress mode, constant vector are promoted
+; Since, the constant is the same as the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: ret
+
+
+; REGULAR-LABEL: test4:
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; Redundant expression
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; REGULAR-NEXT: [[LABEL]]:
+; REGULAR-NEXT: ret
+entry:
+  %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  %tobool = icmp eq i32 %path, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %ret.0 = phi <16 x i8> [ %mul.i, %if.then ], [ %add.i, %entry ]
+  ret <16 x i8> %ret.0
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; one is in a phi.
+define <16 x i8> @test5(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test5:
+; In stress mode, constant vector are promoted
+; Since, the constant is the same as the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b v[[REGNUM]], [[DESTV]], v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[REGNUM]], v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; PROMOTED-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; PROMOTED-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test5:
+; REGULAR: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR-NEXT: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: mul.16b v[[DESTREGNUM:[0-9]+]], [[DESTV]], v[[REGNUM]]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[DESTREGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[DESTREGNUM]], v[[DESTREGNUM]]
+; REGULAR-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; REGULAR-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; REGULAR-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; REGULAR-NEXT: ret
+entry:
+  %tobool = icmp eq i32 %path, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  %mul.i26 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %ret.0 = phi <16 x i8> [ %mul.i26, %if.then ], [ <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, %entry ]
+  %mul.i25 = mul <16 x i8> %ret.0, %ret.0
+  %mul.i24 = mul <16 x i8> %mul.i25, %mul.i25
+  %mul.i23 = mul <16 x i8> %mul.i24, %mul.i24
+  %mul.i = mul <16 x i8> %mul.i23, %mul.i23
+  ret <16 x i8> %mul.i
+}
+
+define void @accessBig(i64* %storage) {
+; PROMOTED-LABEL: accessBig:
+; PROMOTED: adrp
+; PROMOTED: ret
+  %addr = bitcast i64* %storage to <1 x i80>*
+  store <1 x i80> <i80 483673642326615442599424>, <1 x i80>* %addr
+  ret void
+}
+
+define void @asmStatement() {
+; PROMOTED-LABEL: asmStatement:
+; PROMOTED-NOT: adrp
+; PROMOTED: ret
+  call void asm sideeffect "bfxil w0, w0, $0, $1", "i,i"(i32 28, i32 4)
+  ret void
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/redzone.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/redzone.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/redzone.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/redzone.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+
+define i32 @foo(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: sub sp, sp
+; CHECK: ret
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %x = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 %b, i32* %b.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  %tmp1 = load i32* %b.addr, align 4
+  %add = add nsw i32 %tmp, %tmp1
+  store i32 %add, i32* %x, align 4
+  %tmp2 = load i32* %x, align 4
+  ret i32 %tmp2
+}

Added: llvm/trunk/test/CodeGen/ARM64/register-offset-addressing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/register-offset-addressing.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/register-offset-addressing.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/register-offset-addressing.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i8 @t1(i16* %a, i64 %b) {
+; CHECK: t1
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrb w0, [x0, [[REG]]]
+; CHECK: ret
+  %tmp1 = getelementptr inbounds i16* %a, i64 %b
+  %tmp2 = load i16* %tmp1
+  %tmp3 = trunc i16 %tmp2 to i8
+  ret i8 %tmp3
+}

Added: llvm/trunk/test/CodeGen/ARM64/register-pairing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/register-pairing.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/register-pairing.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/register-pairing.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; rdar://14075006
+
+define void @odd() nounwind {
+; CHECK-LABEL: odd:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #42
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+  call void asm sideeffect "mov x0, #42", "~{x0},~{x19},~{x21},~{x23},~{x25},~{x27},~{d8},~{d10},~{d12},~{d14}"() nounwind
+  ret void
+}
+
+define void @even() nounwind {
+; CHECK-LABEL: even:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #42
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+  call void asm sideeffect "mov x0, #42", "~{x0},~{x20},~{x22},~{x24},~{x26},~{x28},~{d9},~{d11},~{d13},~{d15}"() nounwind
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/regress-f128csel-flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/regress-f128csel-flags.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/regress-f128csel-flags.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/regress-f128csel-flags.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,27 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+
+; We used to not mark NZCV as being used in the continuation basic-block
+; when lowering a 128-bit "select" to branches. This meant a subsequent use
+; of the same flags gave an internal fault here.
+
+declare void @foo(fp128)
+
+define double @test_f128csel_flags(i32 %lhs, fp128 %a, fp128 %b, double %l, double %r) nounwind {
+; CHECK: test_f128csel_flags
+
+    %tst = icmp ne i32 %lhs, 42
+    %val = select i1 %tst, fp128 %a, fp128 %b
+; CHECK: cmp w0, #42
+; CHECK: b.eq {{.?LBB0}}
+
+    call void @foo(fp128 %val)
+    %retval = select i1 %tst, double %l, double %r
+
+    ; It's also reasonably important that the actual fcsel comes before the
+    ; function call since bl may corrupt NZCV. We were doing the right thing anyway,
+    ; but just as well test it while we're here.
+; CHECK: fcsel {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, ne
+; CHECK: bl {{_?foo}}
+
+    ret double %retval
+}

Added: llvm/trunk/test/CodeGen/ARM64/return-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/return-vector.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/return-vector.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/return-vector.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; 2x64 vector should be returned in Q0.
+
+define <2 x double> @test(<2 x double>* %p) nounwind {
+; CHECK: test
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+  %tmp1 = load <2 x double>* %p, align 16
+  ret <2 x double> %tmp1
+}

Added: llvm/trunk/test/CodeGen/ARM64/returnaddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/returnaddr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/returnaddr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/returnaddr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: mov x0, lr
+; CHECK: ret
+  %0 = tail call i8* @llvm.returnaddress(i32 0)
+  ret i8* %0
+}
+
+define i8* @rt2() nounwind readnone {
+entry:
+; CHECK-LABEL: rt2:
+; CHECK: stp fp, lr, [sp, #-16]!
+; CHECK: mov fp, sp
+; CHECK: ldr x[[REG:[0-9]+]], [fp]
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]]]
+; CHECK: ldr x0, [x[[REG2]], #8]
+; CHECK: ldp fp, lr, [sp], #16
+; CHECK: ret
+  %0 = tail call i8* @llvm.returnaddress(i32 2)
+  ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/rev.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/rev.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/rev.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/rev.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,221 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @test_rev_w(i32 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_w:
+; CHECK: rev w0, w0
+  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %0
+}
+
+define i64 @test_rev_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_x:
+; CHECK: rev x0, x0
+  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+  ret i64 %0
+}
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
+
+define i32 @test_rev16_w(i32 %X) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_w:
+; CHECK: rev16 w0, w0
+  %tmp1 = lshr i32 %X, 8
+  %X15 = bitcast i32 %X to i32
+  %tmp4 = shl i32 %X15, 8
+  %tmp2 = and i32 %tmp1, 16711680
+  %tmp5 = and i32 %tmp4, -16777216
+  %tmp9 = and i32 %tmp1, 255
+  %tmp13 = and i32 %tmp4, 65280
+  %tmp6 = or i32 %tmp5, %tmp2
+  %tmp10 = or i32 %tmp6, %tmp13
+  %tmp14 = or i32 %tmp10, %tmp9
+  ret i32 %tmp14
+}
+
+define i64 @test_rev16_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_x:
+; CHECK: rev16 x0, x0
+  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %1 = lshr i64 %0, 16
+  %2 = shl i64 %0, 48
+  %3 = or i64 %1, %2
+  ret i64 %3
+}
+
+define i64 @test_rev32_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev32_x:
+; CHECK: rev32 x0, x0
+  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %1 = lshr i64 %0, 32
+  %2 = shl i64 %0, 32
+  %3 = or i64 %1, %2
+  ret i64 %3
+}
+
+define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8:
+;CHECK: rev64.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D16:
+;CHECK: rev64.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+	ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D32:
+;CHECK: rev64.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+	ret <2 x i32> %tmp2
+}
+
+define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Df:
+;CHECK: rev64.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+	ret <2 x float> %tmp2
+}
+
+define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q8:
+;CHECK: rev64.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q16:
+;CHECK: rev64.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+	ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q32:
+;CHECK: rev64.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x i32> %tmp2
+}
+
+define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Qf:
+;CHECK: rev64.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x float> %tmp2
+}
+
+define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D8:
+;CHECK: rev32.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+	ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D16:
+;CHECK: rev32.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+	ret <4 x i16> %tmp2
+}
+
+define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q8:
+;CHECK: rev32.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+	ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16:
+;CHECK: rev32.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+	ret <8 x i16> %tmp2
+}
+
+define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16D8:
+;CHECK: rev16.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+	ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16Q8:
+;CHECK: rev16.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+	ret <16 x i8> %tmp2
+}
+
+; Undef shuffle indices should not prevent matching to VREV:
+
+define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8_undef:
+;CHECK: rev64.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
+	ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16_undef:
+;CHECK: rev32.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
+	ret <8 x i16> %tmp2
+}
+
+; vrev <4 x i16> should use REV32 and not REV64
+define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
+; CHECK-LABEL: test_vrev64:
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: st1.h
+; CHECK: st1.h
+entry:
+  %0 = bitcast <4 x i16>* %source to <8 x i16>*
+  %tmp2 = load <8 x i16>* %0, align 4
+  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
+  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
+  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
+  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
+  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
+  ret void
+}
+
+; Test vrev of float4
+define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
+; CHECK: float_vrev64
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: rev64.4s
+entry:
+  %0 = bitcast float* %source to <4 x float>*
+  %tmp2 = load <4 x float>* %0, align 4
+  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
+  %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
+  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
+  ret void
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/rounding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/rounding.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/rounding.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/rounding.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,208 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-ios6.0.0"
+
+; CHECK: test1
+; CHECK: frintx
+; CHECK: frintm
+define float @test1(float %a) #0 {
+entry:
+  %call = tail call float @floorf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @floorf(float) nounwind readnone
+
+; CHECK: test2
+; CHECK: frintx
+; CHECK: frintm
+define double @test2(double %a) #0 {
+entry:
+  %call = tail call double @floor(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @floor(double) nounwind readnone
+
+; CHECK: test3
+; CHECK: frinti
+define float @test3(float %a) #0 {
+entry:
+  %call = tail call float @nearbyintf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @nearbyintf(float) nounwind readnone
+
+; CHECK: test4
+; CHECK: frinti
+define double @test4(double %a) #0 {
+entry:
+  %call = tail call double @nearbyint(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @nearbyint(double) nounwind readnone
+
+; CHECK: test5
+; CHECK: frintx
+; CHECK: frintp
+define float @test5(float %a) #0 {
+entry:
+  %call = tail call float @ceilf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @ceilf(float) nounwind readnone
+
+; CHECK: test6
+; CHECK: frintx
+; CHECK: frintp
+define double @test6(double %a) #0 {
+entry:
+  %call = tail call double @ceil(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @ceil(double) nounwind readnone
+
+; CHECK: test7
+; CHECK: frintx
+define float @test7(float %a) #0 {
+entry:
+  %call = tail call float @rintf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @rintf(float) nounwind readnone
+
+; CHECK: test8
+; CHECK: frintx
+define double @test8(double %a) #0 {
+entry:
+  %call = tail call double @rint(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @rint(double) nounwind readnone
+
+; CHECK: test9
+; CHECK: frintx
+; CHECK: frintz
+define float @test9(float %a) #0 {
+entry:
+  %call = tail call float @truncf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @truncf(float) nounwind readnone
+
+; CHECK: test10
+; CHECK: frintx
+; CHECK: frintz
+define double @test10(double %a) #0 {
+entry:
+  %call = tail call double @trunc(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @trunc(double) nounwind readnone
+
+; CHECK: test11
+; CHECK: frintx
+; CHECK: frinta
+define float @test11(float %a) #0 {
+entry:
+  %call = tail call float @roundf(float %a) nounwind readnone
+  ret float %call
+}
+
+declare float @roundf(float %a) nounwind readnone
+
+; CHECK: test12
+; CHECK: frintx
+; CHECK: frinta
+define double @test12(double %a) #0 {
+entry:
+  %call = tail call double @round(double %a) nounwind readnone
+  ret double %call
+}
+
+declare double @round(double %a) nounwind readnone
+
+; CHECK: test13
+; CHECK-NOT: frintx
+; CHECK: frintm
+define float @test13(float %a) #1 {
+entry:
+  %call = tail call float @floorf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK: test14
+; CHECK-NOT: frintx
+; CHECK: frintm
+define double @test14(double %a) #1 {
+entry:
+  %call = tail call double @floor(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK: test15
+; CHECK-NOT: frintx
+; CHECK: frintp
+define float @test15(float %a) #1 {
+entry:
+  %call = tail call float @ceilf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK: test16
+; CHECK-NOT: frintx
+; CHECK: frintp
+define double @test16(double %a) #1 {
+entry:
+  %call = tail call double @ceil(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK: test17
+; CHECK-NOT: frintx
+; CHECK: frintz
+define float @test17(float %a) #1 {
+entry:
+  %call = tail call float @truncf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK: test18
+; CHECK-NOT: frintx
+; CHECK: frintz
+define double @test18(double %a) #1 {
+entry:
+  %call = tail call double @trunc(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK: test19
+; CHECK-NOT: frintx
+; CHECK: frinta
+define float @test19(float %a) #1 {
+entry:
+  %call = tail call float @roundf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK: test20
+; CHECK-NOT: frintx
+; CHECK: frinta
+define double @test20(double %a) #1 {
+entry:
+  %call = tail call double @round(double %a) nounwind readnone
+  ret double %call
+}
+
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }

Added: llvm/trunk/test/CodeGen/ARM64/scaled_iv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/scaled_iv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/scaled_iv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/scaled_iv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; Scaling factors in addressing modes are costly.
+; Make loop-reduce prefer unscaled accesses.
+; <rdar://problem/13806271>
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+; CHECK: @mulDouble
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; Only one induction variable should have been generated.
+; CHECK-NOT: phi
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+  %tmp = add nsw i64 %indvars.iv, -1
+  %arrayidx = getelementptr inbounds double* %b, i64 %tmp
+  %tmp1 = load double* %arrayidx, align 8
+; The induction variable should carry the scaling factor: 1 * 8 = 8.
+; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
+  %tmp2 = load double* %arrayidx2, align 8
+  %mul = fmul double %tmp1, %tmp2
+  %arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
+  store double %mul, double* %arrayidx4, align 8
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; Comparison should be 19 * 8 = 152.
+; CHECK: icmp eq i32 {{%[^,]+}}, 152
+  %exitcond = icmp eq i32 %lftr.wideiv, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/scvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/scvt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/scvt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/scvt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,830 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; rdar://13082402
+
+define float @t1(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldr s0, [x0]
+; CHECK: scvtf s0, s0
+  %tmp1 = load i32* %src, align 4
+  %tmp2 = sitofp i32 %tmp1 to float
+  ret float %tmp2
+}
+
+define float @t2(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr s0, [x0]
+; CHECK: ucvtf s0, s0
+  %tmp1 = load i32* %src, align 4
+  %tmp2 = uitofp i32 %tmp1 to float
+  ret float %tmp2
+}
+
+define double @t3(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr d0, [x0]
+; CHECK: scvtf d0, d0
+  %tmp1 = load i64* %src, align 4
+  %tmp2 = sitofp i64 %tmp1 to double
+  ret double %tmp2
+}
+
+define double @t4(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: ldr d0, [x0]
+; CHECK: ucvtf d0, d0
+  %tmp1 = load i64* %src, align 4
+  %tmp2 = uitofp i64 %tmp1 to double
+  ret double %tmp2
+}
+
+; rdar://13136456
+define double @t5(i32* nocapture %src) nounwind ssp optsize {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: ldr [[REG:w[0-9]+]], [x0]
+; CHECK: scvtf d0, [[REG]]
+  %tmp1 = load i32* %src, align 4
+  %tmp2 = sitofp i32 %tmp1 to double
+  ret double %tmp2
+}
+
+; Check that we load in FP register when we want to convert into
+; floating point value.
+; This is much faster than loading on GPR and making the conversion
+; GPR -> FPR.
+; <rdar://problem/14599607>
+;
+; Check the following patterns for signed/unsigned:
+; 1. load with scaled imm to float.
+; 2. load with scaled register to float.
+; 3. load with scaled imm to double.
+; 4. load with scaled register to double.
+; 5. load with unscaled imm to float.
+; 6. load with unscaled imm to double.
+; With loading size: 8, 16, 32, and 64-bits.
+
+; ********* 1. load with scaled imm to float. *********
+define float @fct1(i8* nocapture %sp0) {
+; CHECK-LABEL: fct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 1
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct2(i16* nocapture %sp0) {
+; CHECK-LABEL: fct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 1
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct3(i32* nocapture %sp0) {
+; CHECK-LABEL: fct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 1
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @fct4(i64* nocapture %sp0) {
+; CHECK-LABEL: fct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 1
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; ********* 2. load with scaled register to float. *********
+define float @fct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @fct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+
+; ********* 3. load with scaled imm to double. *********
+define double @fct9(i8* nocapture %sp0) {
+; CHECK-LABEL: fct9:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 1
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct10(i16* nocapture %sp0) {
+; CHECK-LABEL: fct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 1
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct11(i32* nocapture %sp0) {
+; CHECK-LABEL: fct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 1
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct12(i64* nocapture %sp0) {
+; CHECK-LABEL: fct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 1
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+; ********* 4. load with scaled register to double. *********
+define double @fct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct13:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+; ********* 5. load with unscaled imm to float. *********
+define float @fct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i8* %sp0 to i64
+  %add = add i64 %bitcast, -1
+  %addr = inttoptr i64 %add to i8*
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct18(i16* nocapture %sp0) {
+; CHECK-LABEL: fct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i16* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i16*
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @fct19(i32* nocapture %sp0) {
+; CHECK-LABEL: fct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i32* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i32*
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @fct20(i64* nocapture %sp0) {
+; CHECK-LABEL: fct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i64* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i64*
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+
+}
+
+; ********* 6. load with unscaled imm to double. *********
+define double @fct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct21:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i8* %sp0 to i64
+  %add = add i64 %bitcast, -1
+  %addr = inttoptr i64 %add to i8*
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = uitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct22(i16* nocapture %sp0) {
+; CHECK-LABEL: fct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i16* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i16*
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = uitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct23(i32* nocapture %sp0) {
+; CHECK-LABEL: fct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i32* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i32*
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = uitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @fct24(i64* nocapture %sp0) {
+; CHECK-LABEL: fct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i64* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i64*
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = uitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+
+}
+
+; ********* 1s. load with scaled imm to float. *********
+define float @sfct1(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 1
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct2(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 1
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct3(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 1
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @sfct4(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 1
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; ********* 2s. load with scaled register to float. *********
+define float @sfct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @sfct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; ********* 3s. load with scaled imm to double. *********
+define double @sfct9(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct9:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 1
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct10(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 1
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct11(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 1
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct12(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 1
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+; ********* 4s. load with scaled register to double. *********
+define double @sfct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct13:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i8* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i16* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i64* %sp0, i64 %offset
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+; ********* 5s. load with unscaled imm to float. *********
+define float @sfct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i8* %sp0 to i64
+  %add = add i64 %bitcast, -1
+  %addr = inttoptr i64 %add to i8*
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct18(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i16* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i16*
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define float @sfct19(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i32* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i32*
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on floating point unit.
+define float @sfct20(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i64* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i64*
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+
+}
+
+; ********* 6s. load with unscaled imm to double. *********
+define double @sfct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct21:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i8* %sp0 to i64
+  %add = add i64 %bitcast, -1
+  %addr = inttoptr i64 %add to i8*
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct22(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i16* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i16*
+  %pix_sp0.0.copyload = load i16* %addr, align 1
+  %val = sitofp i16 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct23(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i32* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i32*
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+define double @sfct24(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i64* %sp0 to i64
+  %add = add i64 %bitcast, 1
+  %addr = inttoptr i64 %add to i64*
+  %pix_sp0.0.copyload = load i64* %addr, align 1
+  %val = sitofp i64 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+
+}
+
+; Check that we do not use SSHLL code sequence when code size is a concern.
+define float @codesize_sfct17(i8* nocapture %sp0) optsize {
+entry:
+; CHECK-LABEL: codesize_sfct17:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+  %bitcast = ptrtoint i8* %sp0 to i64
+  %add = add i64 %bitcast, -1
+  %addr = inttoptr i64 %add to i8*
+  %pix_sp0.0.copyload = load i8* %addr, align 1
+  %val = sitofp i8 %pix_sp0.0.copyload to float
+  %vmull.i = fmul float %val, %val
+  ret float %vmull.i
+}
+
+define double @codesize_sfct11(i32* nocapture %sp0) minsize {
+; CHECK-LABEL: codesize_sfct11:
+; CHECK: ldr w[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+  %addr = getelementptr i32* %sp0, i64 1
+  %pix_sp0.0.copyload = load i32* %addr, align 1
+  %val = sitofp i32 %pix_sp0.0.copyload to double
+  %vmull.i = fmul double %val, %val
+  ret double %vmull.i
+}
+
+; Adding fp128 custom lowering makes these a little fragile since we have to
+; return the correct mix of Legal/Expand from the custom method.
+;
+; rdar://problem/14991489
+
+define float @float_from_i128(i128 %in) {
+; CHECK-LABEL: float_from_i128:
+; CHECK: bl {{_?__floatuntisf}}
+  %conv = uitofp i128 %in to float
+  ret float %conv
+}
+
+define double @double_from_i128(i128 %in) {
+; CHECK-LABEL: double_from_i128:
+; CHECK: bl {{_?__floattidf}}
+  %conv = sitofp i128 %in to double
+  ret double %conv
+}
+
+define fp128 @fp128_from_i128(i128 %in) {
+; CHECK-LABEL: fp128_from_i128:
+; CHECK: bl {{_?__floatuntitf}}
+  %conv = uitofp i128 %in to fp128
+  ret fp128 %conv
+}
+
+define i128 @i128_from_float(float %in) {
+; CHECK-LABEL: i128_from_float
+; CHECK: bl {{_?__fixsfti}}
+  %conv = fptosi float %in to i128
+  ret i128 %conv
+}
+
+define i128 @i128_from_double(double %in) {
+; CHECK-LABEL: i128_from_double
+; CHECK: bl {{_?__fixunsdfti}}
+  %conv = fptoui double %in to i128
+  ret i128 %conv
+}
+
+define i128 @i128_from_fp128(fp128 %in) {
+; CHECK-LABEL: i128_from_fp128
+; CHECK: bl {{_?__fixtfti}}
+  %conv = fptosi fp128 %in to i128
+  ret i128 %conv
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/shifted-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/shifted-sext.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/shifted-sext.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/shifted-sext.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,277 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; <rdar://problem/13820218>
+
+define signext i16 @extendedLeftShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #7
+  %inc = add i8 %a, 1
+  %conv1 = sext i8 %inc to i32
+  %shl = shl nsw i32 %conv1, 4
+  %conv2 = trunc i32 %shl to i16
+  ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #7
+  %inc = add i8 %a, 1
+  %conv1 = sext i8 %inc to i32
+  %shr4 = lshr i32 %conv1, 4
+  %conv2 = trunc i32 %shr4 to i16
+  ret i16 %conv2
+}
+
+define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #24, #7
+  %inc = add i8 %a, 1
+  %conv1 = sext i8 %inc to i32
+  %shl = shl nsw i32 %conv1, 8
+  %conv2 = trunc i32 %shl to i16
+  ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+  %inc = add i8 %a, 1
+  %conv1 = sext i8 %inc to i32
+  %shr4 = lshr i32 %conv1, 8
+  %conv2 = trunc i32 %shr4 to i16
+  ret i16 %conv2
+}
+
+define i32 @extendedLeftShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i32
+  %shl = shl nsw i32 %conv, 4
+  ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i32
+  %shr = ashr i32 %conv, 4
+  ret i32 %shr
+}
+
+define i32 @extendedLeftShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #24, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i32
+  %shl = shl nsw i32 %conv, 8
+  ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i32
+  %shr = ashr i32 %conv, 8
+  ret i32 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i64
+  %shl = shl nsw i64 %conv, 4
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i64
+  %shr = ashr i64 %conv, 4
+  ret i64 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #56, #7
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i64
+  %shl = shl nsw i64 %conv, 8
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtb x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #8
+  %inc = add i8 %a, 1
+  %conv = sext i8 %inc to i64
+  %shr = ashr i64 %conv, 8
+  ret i64 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #15
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i32
+  %shl = shl nsw i32 %conv, 4
+  ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #15
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i32
+  %shr = ashr i32 %conv, 4
+  ret i32 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: lsl w0, [[REG]], #16
+  %inc = add i16 %a, 1
+  %conv2 = zext i16 %inc to i32
+  %shl = shl nuw i32 %conv2, 16
+  ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxth [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #16
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i32
+  %shr = ashr i32 %conv, 16
+  ret i32 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #15
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i64
+  %shl = shl nsw i64 %conv, 4
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #15
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i64
+  %shr = ashr i64 %conv, 4
+  ret i64 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #48, #15
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i64
+  %shl = shl nsw i64 %conv, 16
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxth x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #16
+  %inc = add i16 %a, 1
+  %conv = sext i16 %inc to i64
+  %shr = ashr i64 %conv, 16
+  ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #31
+  %inc = add nsw i32 %a, 1
+  %conv = sext i32 %inc to i64
+  %shl = shl nsw i64 %conv, 4
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #31
+  %inc = add nsw i32 %a, 1
+  %conv = sext i32 %inc to i64
+  %shr = ashr i64 %conv, 4
+  ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: lsl x0, x[[REG]], #32
+  %inc = add nsw i32 %a, 1
+  %conv2 = zext i32 %inc to i64
+  %shl = shl nuw i64 %conv2, 32
+  ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtw x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #32
+  %inc = add nsw i32 %a, 1
+  %conv = sext i32 %inc to i64
+  %shr = ashr i64 %conv, 32
+  ret i64 %shr
+}

Added: llvm/trunk/test/CodeGen/ARM64/simd-scalar-to-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/simd-scalar-to-vector.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/simd-scalar-to-vector.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/simd-scalar-to-vector.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
+; CHECK: uaddlv.16b	h0, v0
+; CHECK: rshrn.8b	v0, v0, #4
+; CHECK: dup.16b	v0, v0[0]
+; CHECK: ret
+  %tmp = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
+  %tmp1 = trunc i32 %tmp to i16
+  %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
+  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %tmp4
+}
+
+declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/simplest-elf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/simplest-elf.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/simplest-elf.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/simplest-elf.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj < %s | llvm-objdump - -r -d --triple=arm64-linux-gnu | FileCheck --check-prefix=CHECK-ELF %s
+
+define void @foo() nounwind {
+  ret void
+}
+
+  ; Check source looks ELF-like: no leading underscore, comments with //
+; CHECK: foo: // @foo
+; CHECK:     ret
+
+  ; Similarly make sure ELF output works and is vaguely sane: aarch64 target
+  ; machine with correct section & symbol names.
+; CHECK-ELF: file format ELF64-aarch64
+
+; CHECK-ELF: Disassembly of section .text
+; CHECK-ELF-LABEL: foo:
+; CHECK-ELF:    ret

Added: llvm/trunk/test/CodeGen/ARM64/sincos.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/sincos.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/sincos.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/sincos.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7 | FileCheck %s
+
+; Combine sin / cos into a single call.
+; rdar://12856873
+
+define float @test1(float %x) nounwind {
+entry:
+; CHECK-LABEL: test1:
+; CHECK: bl ___sincosf_stret
+; CHECK: fadd s0, s0, s1
+  %call = tail call float @sinf(float %x) nounwind readnone
+  %call1 = tail call float @cosf(float %x) nounwind readnone
+  %add = fadd float %call, %call1
+  ret float %add
+}
+
+define double @test2(double %x) nounwind {
+entry:
+; CHECK-LABEL: test2:
+; CHECK: bl ___sincos_stret
+; CHECK: fadd d0, d0, d1
+  %call = tail call double @sin(double %x) nounwind readnone
+  %call1 = tail call double @cos(double %x) nounwind readnone
+  %add = fadd double %call, %call1
+  ret double %add
+}
+
+declare float  @sinf(float) readonly
+declare double @sin(double) readonly
+declare float @cosf(float) readonly
+declare double @cos(double) readonly

Added: llvm/trunk/test/CodeGen/ARM64/sitofp-combine-chains.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/sitofp-combine-chains.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/sitofp-combine-chains.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/sitofp-combine-chains.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,22 @@
+; RUN: llc -march=arm64 -o -  %s | FileCheck %s
+
+; ARM64ISelLowering.cpp was creating a new (floating-point) load for efficiency
+; but not updating chain-successors of the old one. As a result, the two memory
+; operations in this function both ended up direct successors to the EntryToken
+; and could be reordered.
+
+ at var = global i32 0, align 4
+
+define float @foo() {
+; CHECK-LABEL: foo:
+  ; Load must come before we clobber @var
+; CHECK: adrp x[[VARBASE:[0-9]+]], {{_?var}}
+; CHECK: ldr [[SREG:s[0-9]+]], [x[[VARBASE]],
+; CHECK: str wzr, [x[[VARBASE]],
+
+  %val = load i32* @var, align 4
+  store i32 0, i32* @var, align 4
+
+  %fltval = sitofp i32 %val to float
+  ret float %fltval
+}

Added: llvm/trunk/test/CodeGen/ARM64/sli-sri-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/sli-sri-opt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/sli-sri-opt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/sli-sri-opt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,41 @@
+; RUN: llc -arm64-shift-insert-generation=true -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood:
+; CHECK: sli.16b v0, v1, #3
+  %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+  %vshl_n = shl <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %result = or <16 x i8> %and.i, %vshl_n
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testLeftBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad:
+; CHECK-NOT: sli
+  %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+  %vshl_n = shl <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %result = or <16 x i8> %and.i, %vshl_n
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightGood:
+; CHECK: sri.16b v0, v1, #3
+  %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+  %vshl_n = lshr <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %result = or <16 x i8> %and.i, %vshl_n
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}
+
+define void @testRightBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightBad:
+; CHECK-NOT: sri
+  %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+  %vshl_n = lshr <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %result = or <16 x i8> %and.i, %vshl_n
+  store <16 x i8> %result, <16 x i8>* %dest, align 16
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/smaxv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/smaxv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/smaxv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/smaxv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
+; CHECK: test_vmaxv_s8
+; CHECK: smaxv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
+  %0 = trunc i32 %vmaxv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vmaxv_s16(<4 x i16> %a1) {
+; CHECK: test_vmaxv_s16
+; CHECK: smaxv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
+  %0 = trunc i32 %vmaxv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vmaxv_s32(<2 x i32> %a1) {
+; CHECK: test_vmaxv_s32
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: smaxp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
+  ret i32 %vmaxv.i
+}
+
+define signext i8 @test_vmaxvq_s8(<16 x i8> %a1) {
+; CHECK: test_vmaxvq_s8
+; CHECK: smaxv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
+  %0 = trunc i32 %vmaxv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vmaxvq_s16(<8 x i16> %a1) {
+; CHECK: test_vmaxvq_s16
+; CHECK: smaxv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
+  %0 = trunc i32 %vmaxv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a1) {
+; CHECK: test_vmaxvq_s32
+; CHECK: smaxv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
+  ret i32 %vmaxv.i
+}
+
+declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>)
+

Added: llvm/trunk/test/CodeGen/ARM64/sminv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/sminv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/sminv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/sminv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vminv_s8(<8 x i8> %a1) {
+; CHECK: test_vminv_s8
+; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a1)
+  %0 = trunc i32 %vminv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vminv_s16(<4 x i16> %a1) {
+; CHECK: test_vminv_s16
+; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a1)
+  %0 = trunc i32 %vminv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vminv_s32(<2 x i32> %a1) {
+; CHECK: test_vminv_s32
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: sminp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32> %a1)
+  ret i32 %vminv.i
+}
+
+define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
+; CHECK: test_vminvq_s8
+; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a1)
+  %0 = trunc i32 %vminv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
+; CHECK: test_vminvq_s16
+; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a1)
+  %0 = trunc i32 %vminv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a1) {
+; CHECK: test_vminvq_s32
+; CHECK: sminv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a1)
+  ret i32 %vminv.i
+}
+
+declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>)
+

Added: llvm/trunk/test/CodeGen/ARM64/spill-lr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/spill-lr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/spill-lr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/spill-lr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s
+ at bar = common global i32 0, align 4
+
+; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack
+; this will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional scratch
+; register.
+;
+; This is a crash-only regression test for rdar://15124582.
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind {
+entry:
+  %stack = alloca [128 x i32], align 4
+  %0 = bitcast [128 x i32]* %stack to i8*
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom
+  store i32 %b, i32* %arrayidx, align 4
+  %1 = load volatile i32* @bar, align 4
+  %2 = load volatile i32* @bar, align 4
+  %3 = load volatile i32* @bar, align 4
+  %4 = load volatile i32* @bar, align 4
+  %5 = load volatile i32* @bar, align 4
+  %6 = load volatile i32* @bar, align 4
+  %7 = load volatile i32* @bar, align 4
+  %8 = load volatile i32* @bar, align 4
+  %9 = load volatile i32* @bar, align 4
+  %10 = load volatile i32* @bar, align 4
+  %11 = load volatile i32* @bar, align 4
+  %12 = load volatile i32* @bar, align 4
+  %13 = load volatile i32* @bar, align 4
+  %14 = load volatile i32* @bar, align 4
+  %15 = load volatile i32* @bar, align 4
+  %16 = load volatile i32* @bar, align 4
+  %17 = load volatile i32* @bar, align 4
+  %18 = load volatile i32* @bar, align 4
+  %19 = load volatile i32* @bar, align 4
+  %20 = load volatile i32* @bar, align 4
+  %idxprom1 = sext i32 %c to i64
+  %arrayidx2 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom1
+  %21 = load i32* %arrayidx2, align 4
+  %factor = mul i32 %h, -2
+  %factor67 = mul i32 %g, -2
+  %factor68 = mul i32 %f, -2
+  %factor69 = mul i32 %e, -2
+  %factor70 = mul i32 %d, -2
+  %factor71 = mul i32 %c, -2
+  %factor72 = mul i32 %b, -2
+  %sum = add i32 %2, %1
+  %sum73 = add i32 %sum, %3
+  %sum74 = add i32 %sum73, %4
+  %sum75 = add i32 %sum74, %5
+  %sum76 = add i32 %sum75, %6
+  %sum77 = add i32 %sum76, %7
+  %sum78 = add i32 %sum77, %8
+  %sum79 = add i32 %sum78, %9
+  %sum80 = add i32 %sum79, %10
+  %sum81 = add i32 %sum80, %11
+  %sum82 = add i32 %sum81, %12
+  %sum83 = add i32 %sum82, %13
+  %sum84 = add i32 %sum83, %14
+  %sum85 = add i32 %sum84, %15
+  %sum86 = add i32 %sum85, %16
+  %sum87 = add i32 %sum86, %17
+  %sum88 = add i32 %sum87, %18
+  %sum89 = add i32 %sum88, %19
+  %sum90 = add i32 %sum89, %20
+  %sub15 = sub i32 %21, %sum90
+  %sub16 = add i32 %sub15, %factor
+  %sub17 = add i32 %sub16, %factor67
+  %sub18 = add i32 %sub17, %factor68
+  %sub19 = add i32 %sub18, %factor69
+  %sub20 = add i32 %sub19, %factor70
+  %sub21 = add i32 %sub20, %factor71
+  %add = add i32 %sub21, %factor72
+  ret i32 %add
+}

Added: llvm/trunk/test/CodeGen/ARM64/spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/spill.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/spill.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/spill.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+; CHECK: fpr128
+; CHECK: ld1.2d
+; CHECK: str q
+; CHECK: inlineasm
+; CHECK: ldr q
+; CHECK: st1.2d
+define void @fpr128(<4 x float>* %p) nounwind ssp {
+entry:
+  %x = load <4 x float>* %p, align 16
+  call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+  store <4 x float> %x, <4 x float>* %p, align 16
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/st1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/st1.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/st1.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/st1.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,628 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+define void @st1lane_16b(<16 x i8> %A, i8* %D) {
+; CHECK: st1lane_16b
+; CHECK: st1.b
+  %tmp = extractelement <16 x i8> %A, i32 1
+  store i8 %tmp, i8* %D
+  ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, i16* %D) {
+; CHECK: st1lane_8h
+; CHECK: st1.h
+  %tmp = extractelement <8 x i16> %A, i32 1
+  store i16 %tmp, i16* %D
+  ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, i32* %D) {
+; CHECK: st1lane_4s
+; CHECK: st1.s
+  %tmp = extractelement <4 x i32> %A, i32 1
+  store i32 %tmp, i32* %D
+  ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, i64* %D) {
+; CHECK: st1lane_2d
+; CHECK: st1.d
+  %tmp = extractelement <2 x i64> %A, i32 1
+  store i64 %tmp, i64* %D
+  ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
+; CHECK: st2lane_16b
+; CHECK: st2.b
+  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
+  ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
+; CHECK: st2lane_8h
+; CHECK: st2.h
+  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
+  ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
+; CHECK: st2lane_4s
+; CHECK: st2.s
+  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
+  ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
+; CHECK: st2lane_2d
+; CHECK: st2.d
+  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
+; CHECK: st3lane_16b
+; CHECK: st3.b
+  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
+  ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
+; CHECK: st3lane_8h
+; CHECK: st3.h
+  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
+  ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
+; CHECK: st3lane_4s
+; CHECK: st3.s
+  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
+  ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
+; CHECK: st3lane_2d
+; CHECK: st3.d
+  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
+; CHECK: st4lane_16b
+; CHECK: st4.b
+  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
+  ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
+; CHECK: st4lane_8h
+; CHECK: st4.h
+  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
+  ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
+; CHECK: st4lane_4s
+; CHECK: st4.s
+  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
+  ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
+; CHECK: st4lane_2d
+; CHECK: st4.d
+  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+
+define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
+; CHECK: st2_8b
+; CHECK: st2.8b
+	call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
+	ret void
+}
+
+define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
+; CHECK: st3_8b
+; CHECK: st3.8b
+	call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
+	ret void
+}
+
+define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
+; CHECK: st4_8b
+; CHECK: st4.8b
+	call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
+; CHECK: st2_16b
+; CHECK: st2.16b
+	call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
+	ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
+; CHECK: st3_16b
+; CHECK: st3.16b
+	call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
+	ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
+; CHECK: st4_16b
+; CHECK: st4.16b
+	call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+
+define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
+; CHECK: st2_4h
+; CHECK: st2.4h
+	call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
+	ret void
+}
+
+define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
+; CHECK: st3_4h
+; CHECK: st3.4h
+	call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
+	ret void
+}
+
+define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
+; CHECK: st4_4h
+; CHECK: st4.4h
+	call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+
+define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
+; CHECK: st2_8h
+; CHECK: st2.8h
+	call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
+	ret void
+}
+
+define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
+; CHECK: st3_8h
+; CHECK: st3.8h
+	call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
+	ret void
+}
+
+define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
+; CHECK: st4_8h
+; CHECK: st4.8h
+	call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+
+define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
+; CHECK: st2_2s
+; CHECK: st2.2s
+	call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
+	ret void
+}
+
+define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
+; CHECK: st3_2s
+; CHECK: st3.2s
+	call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
+	ret void
+}
+
+define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
+; CHECK: st4_2s
+; CHECK: st4.2s
+	call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+
+define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
+; CHECK: st2_4s
+; CHECK: st2.4s
+	call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
+	ret void
+}
+
+define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
+; CHECK: st3_4s
+; CHECK: st3.4s
+	call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
+	ret void
+}
+
+define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
+; CHECK: st4_4s
+; CHECK: st4.4s
+	call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+
+define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
+; CHECK: st2_1d
+; CHECK: st1.1d
+	call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
+	ret void
+}
+
+define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
+; CHECK: st3_1d
+; CHECK: st1.1d
+	call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
+	ret void
+}
+
+define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
+; CHECK: st4_1d
+; CHECK: st1.1d
+	call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+
+define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
+; CHECK: st2_2d
+; CHECK: st2.2d
+	call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
+	ret void
+}
+
+define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
+; CHECK: st3_2d
+; CHECK: st3.2d
+	call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
+	ret void
+}
+
+define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
+; CHECK: st4_2d
+; CHECK: st4.2d
+	call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
+	ret void
+}
+
+declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+
+declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
+  ret void
+}
+
+define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
+  ret void
+}
+
+define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
+  ret void
+}
+
+define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
+  ret void
+}
+
+define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
+  ret void
+}
+
+define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
+  ret void
+}
+
+define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
+  ret void
+}
+
+define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
+  ret void
+}
+
+define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
+  ret void
+}
+
+define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
+  ret void
+}
+
+define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
+  ret void
+}
+
+define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
+  ret void
+}
+
+define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
+  ret void
+}
+
+define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
+  ret void
+}
+
+define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
+  ret void
+}
+
+define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
+  ret void
+}
+
+define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
+  ret void
+}
+
+define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
+  ret void
+}
+
+define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
+  ret void
+}
+
+define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
+  ret void
+}
+
+define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
+  ret void
+}
+
+
+declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind ; st1x4 writes memory, so it must not be 'readonly'
+declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind
+declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind
+declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind
+declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind
+declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind
+
+define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
+  ret void
+}
+
+define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
+  ret void
+}
+
+define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
+  ret void
+}
+
+define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
+  ret void
+}
+
+define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
+  ret void
+}
+
+define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
+  ret void
+}
+
+declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind ; st1x4 writes memory, so it must not be 'readonly'
+declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind
+declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind
+declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind
+declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind
+declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind
+
+define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
+  ret void
+}
+
+define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
+  ret void
+}
+
+define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
+  ret void
+}
+
+define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
+  ret void
+}
+
+define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
+  ret void
+}
+
+define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/stack-no-frame.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/stack-no-frame.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/stack-no-frame.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/stack-no-frame.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+
+ at global = global [20 x i64] zeroinitializer, align 8
+
+; The following function has enough locals to need some restoring, but not a
+; frame record. In an intermediate frame refactoring, prologue and epilogue were
+; inconsistent about how much to move SP.
+define void @test_stack_no_frame() {
+; CHECK: test_stack_no_frame
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+  %local = alloca [20 x i64]
+  %val = load volatile [20 x i64]* @global, align 8
+  store volatile [20 x i64] %val, [20 x i64]* %local, align 8
+
+  %val2 = load volatile [20 x i64]* %local, align 8
+  store volatile [20 x i64] %val2, [20 x i64]* @global, align 8
+
+; CHECK: add sp, sp, #[[STACKSIZE]]
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/stackmap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/stackmap.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/stackmap.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/stackmap.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,281 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; We are not getting the correct stack alignment when cross compiling for arm64.
+; So specify a datalayout here.
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; CHECK-NEXT:   .long   0
+; Num Functions
+; CHECK-NEXT:   .long 11
+; CHECK-NEXT:   .long _constantargs
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _osrinline
+; CHECK-NEXT:   .long 32
+; CHECK-NEXT:   .long _osrcold
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _propertyRead
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _propertyWrite
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _jsVoidCall
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _jsIntCall
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _spilledValue
+; CHECK-NEXT:   .long 160
+; CHECK-NEXT:   .long _spilledStackMapValue
+; CHECK-NEXT:   .long 128
+; CHECK-NEXT:   .long _liveConstant
+; CHECK-NEXT:   .long 16
+; CHECK-NEXT:   .long _clobberLR
+; CHECK-NEXT:   .long 112
+; Num LargeConstants
+; CHECK-NEXT:   .long   2
+; CHECK-NEXT:   .quad   4294967295
+; CHECK-NEXT:   .quad   4294967296
+; Num Callsites
+; CHECK-NEXT:   .long   11
+
+; Constant arguments
+;
+; CHECK-NEXT:   .quad   1
+; CHECK-NEXT:   .long   L{{.*}}-_constantargs
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  4
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65535
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65536
+; SmallConstant
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   0
+; LargeConstant at index 0
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   1
+
+define void @constantargs() {
+entry:
+  %0 = inttoptr i64 244837814094590 to i8*
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+  ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-LABEL:  .long   L{{.*}}-_osrinline
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+  ; Runtime void->void call.
+  call void inttoptr (i64 244837814094590 to void ()*)()
+  ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-LABEL:  .long   L{{.*}}-_osrcold
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+  %test = icmp slt i64 %a, %b
+  br i1 %test, label %ret, label %cold
+cold:
+  ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
+  %thunk = inttoptr i64 244837814094590 to i8*
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+  unreachable
+ret:
+  ret void
+}
+
+; Property Read
+; CHECK-LABEL:  .long   L{{.*}}-_propertyRead
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+  %resolveRead = inttoptr i64 244837814094590 to i8*
+  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Property Write
+; CHECK-LABEL:  .long   L{{.*}}-_propertyWrite
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+  %resolveWrite = inttoptr i64 244837814094590 to i8*
+  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL:  .long   L{{.*}}-_jsVoidCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 244837814094590 to i8*
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL:  .long   L{{.*}}-_jsIntCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 244837814094590 to i8*
+  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 28 stack map entries.
+;
+; CHECK-LABEL:  .long L{{.*}}-_spilledValue
+; CHECK-NEXT:   .short 0
+; CHECK-NEXT:   .short 28
+;
+; Check that at least one is a spilled entry relative to the frame pointer (x29).
+; Location: Indirect FP + ...
+; CHECK:        .byte 3
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 29
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
+entry:
+  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+  ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 30 stack map entries.
+;
+; CHECK-LABEL:  .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT:   .short 0
+; CHECK-NEXT:   .short 30
+;
+; Check that at least one is a spilled entry relative to the frame pointer (x29).
+; Location: Indirect FP + ...
+; CHECK:        .byte 3
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short 29
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
+entry:
+  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+  ret void
+}
+
+
+; Map a constant value.
+;
+; CHECK-LABEL:  .long L{{.*}}-_liveConstant
+; CHECK-NEXT:   .short 0
+; 1 location
+; CHECK-NEXT:   .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   33
+
+define void @liveConstant() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+  ret void
+}
+
+; Map a value when LR is the only free register.
+;
+; CHECK-LABEL:  .long L{{.*}}-_clobberLR
+; CHECK-NEXT:   .short 0
+; 1 location
+; CHECK-NEXT:   .short 1
+; Loc 0: Indirect FP (r29) - offset
+; CHECK-NEXT:   .byte   3
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .short  29
+; CHECK-NEXT:   .long   -{{[0-9]+}}
+define void @clobberLR(i32 %a) {
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

Added: llvm/trunk/test/CodeGen/ARM64/stacksave.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/stacksave.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/stacksave.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/stacksave.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,20 @@
+; RUN: llc < %s -verify-coalescing | FileCheck %s
+; <rdar://problem/11522048>
+target triple = "arm64-apple-macosx10.8.0"
+
+; Verify that we can handle spilling the stack pointer without attempting
+; spilling it directly.
+; CHECK: f
+; CHECK: mov [[X0:x[0-9]+]], sp
+; CHECK: str [[X0]]
+; CHECK: inlineasm
+define void @f() nounwind ssp {
+entry:
+  %savedstack = call i8* @llvm.stacksave() nounwind
+  call void asm sideeffect "; inlineasm", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+  call void @llvm.stackrestore(i8* %savedstack) nounwind
+  ret void
+}
+
+declare i8* @llvm.stacksave() nounwind
+declare void @llvm.stackrestore(i8*) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/stp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/stp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/stp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/stp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=arm64 -arm64-stp-suppress=false -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN:   -verify-machineinstrs | FileCheck -check-prefix=STUR_CHK %s
+
+; CHECK: stp_int
+; CHECK: stp w0, w1, [x2]
+define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+  store i32 %a, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32* %p, i64 1
+  store i32 %b, i32* %add.ptr, align 4
+  ret void
+}
+
+; CHECK: stp_long
+; CHECK: stp x0, x1, [x2]
+define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+  store i64 %a, i64* %p, align 8
+  %add.ptr = getelementptr inbounds i64* %p, i64 1
+  store i64 %b, i64* %add.ptr, align 8
+  ret void
+}
+
+; CHECK: stp_float
+; CHECK: stp s0, s1, [x0]
+define void @stp_float(float %a, float %b, float* nocapture %p) nounwind {
+  store float %a, float* %p, align 4
+  %add.ptr = getelementptr inbounds float* %p, i64 1
+  store float %b, float* %add.ptr, align 4
+  ret void
+}
+
+; CHECK: stp_double
+; CHECK: stp d0, d1, [x0]
+define void @stp_double(double %a, double %b, double* nocapture %p) nounwind {
+  store double %a, double* %p, align 8
+  %add.ptr = getelementptr inbounds double* %p, i64 1
+  store double %b, double* %add.ptr, align 8
+  ret void
+}
+
+; Test the load/store optimizer---combine ldurs into a ldp, if appropriate
+define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+; STUR_CHK: stur_int
+; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i32* %p, i32 -1
+  store i32 %a, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32* %p, i32 -2
+  store i32 %b, i32* %p2, align 2
+  ret void
+}
+
+define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+; STUR_CHK: stur_long
+; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds i64* %p, i32 -1
+  store i64 %a, i64* %p1, align 2
+  %p2 = getelementptr inbounds i64* %p, i32 -2
+  store i64 %b, i64* %p2, align 2
+  ret void
+}
+
+define void @stur_float(float %a, float %b, float* nocapture %p) nounwind {
+; STUR_CHK: stur_float
+; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds float* %p, i32 -1
+  store float %a, float* %p1, align 2
+  %p2 = getelementptr inbounds float* %p, i32 -2
+  store float %b, float* %p2, align 2
+  ret void
+}
+
+define void @stur_double(double %a, double %b, double* nocapture %p) nounwind {
+; STUR_CHK: stur_double
+; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+  %p1 = getelementptr inbounds double* %p, i32 -1
+  store double %a, double* %p1, align 2
+  %p2 = getelementptr inbounds double* %p, i32 -2
+  store double %b, double* %p2, align 2
+  ret void
+}
+
+define void @splat_v4i32(i32 %v, i32 *%p) {
+entry:
+
+; CHECK-LABEL: splat_v4i32
+; CHECK-DAG: stp w0, w0, [x1]
+; CHECK-DAG: stp w0, w0, [x1, #8]
+; CHECK: ret
+
+  %p17 = insertelement <4 x i32> undef, i32 %v, i32 0
+  %p18 = insertelement <4 x i32> %p17, i32 %v, i32 1
+  %p19 = insertelement <4 x i32> %p18, i32 %v, i32 2
+  %p20 = insertelement <4 x i32> %p19, i32 %v, i32 3
+  %p21 = bitcast i32* %p to <4 x i32>*
+  store <4 x i32> %p20, <4 x i32>* %p21, align 4
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/strict-align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/strict-align.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/strict-align.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/strict-align.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
+
+define i32 @f0(i32* nocapture %p) nounwind {
+; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
+; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
+; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
+; CHECK-STRICT: ret
+
+; CHECK: ldr w0, [x0]
+; CHECK: ret
+  %tmp = load i32* %p, align 2
+  ret i32 %tmp
+}
+
+define i64 @f1(i64* nocapture %p) nounwind {
+; CHECK-STRICT:	ldp	w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
+; CHECK-STRICT:	orr	x0, x[[LOW]], x[[HIGH]], lsl #32
+; CHECK-STRICT:	ret
+
+; CHECK: ldr x0, [x0]
+; CHECK: ret
+  %tmp = load i64* %p, align 4
+  ret i64 %tmp
+}

Added: llvm/trunk/test/CodeGen/ARM64/stur.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/stur.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/stur.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/stur.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,98 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+%struct.X = type <{ i32, i64, i64 }>
+
+define void @foo1(i32* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: 	stur	w1, [x0, #-4]
+; CHECK-NEXT: 	ret
+  %tmp1 = trunc i64 %val to i32
+  %ptr = getelementptr inbounds i32* %p, i64 -1
+  store i32 %tmp1, i32* %ptr, align 4
+  ret void
+}
+define void @foo2(i16* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: 	sturh	w1, [x0, #-2]
+; CHECK-NEXT: 	ret
+  %tmp1 = trunc i64 %val to i16
+  %ptr = getelementptr inbounds i16* %p, i64 -1
+  store i16 %tmp1, i16* %ptr, align 2
+  ret void
+}
+define void @foo3(i8* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: 	sturb	w1, [x0, #-1]
+; CHECK-NEXT: 	ret
+  %tmp1 = trunc i64 %val to i8
+  %ptr = getelementptr inbounds i8* %p, i64 -1
+  store i8 %tmp1, i8* %ptr, align 1
+  ret void
+}
+define void @foo4(i16* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: 	sturh	w1, [x0, #-2]
+; CHECK-NEXT: 	ret
+  %tmp1 = trunc i32 %val to i16
+  %ptr = getelementptr inbounds i16* %p, i32 -1
+  store i16 %tmp1, i16* %ptr, align 2
+  ret void
+}
+define void @foo5(i8* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: 	sturb	w1, [x0, #-1]
+; CHECK-NEXT: 	ret
+  %tmp1 = trunc i32 %val to i8
+  %ptr = getelementptr inbounds i8* %p, i32 -1
+  store i8 %tmp1, i8* %ptr, align 1
+  ret void
+}
+
+define void @foo(%struct.X* nocapture %p) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: str
+; CHECK: stur    xzr, [x0, #12]
+; CHECK-NEXT: stur    xzr, [x0, #4]
+; CHECK-NEXT: ret
+  %B = getelementptr inbounds %struct.X* %p, i64 0, i32 1
+  %val = bitcast i64* %B to i8*
+  call void @llvm.memset.p0i8.i64(i8* %val, i8 0, i64 16, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; Unaligned 16b stores are split into 8b stores for performance.
+; radar://15424193
+
+; CHECK-LABEL: unaligned:
+; CHECK-NOT: str q0
+; CHECK: str     d[[REG:[0-9]+]], [x0]
+; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG]], v[[REG]], #8
+; CHECK: str     d[[REG2]], [x0, #8]
+define void @unaligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+  store <4 x i32> %v, <4 x i32>* %p, align 4
+  ret void
+}
+
+; CHECK-LABEL: aligned:
+; CHECK: str q0
+define void @aligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+  store <4 x i32> %v, <4 x i32>* %p
+  ret void
+}
+
+; Don't split one and two byte aligned stores.
+; radar://16349308
+
+; CHECK-LABEL: twobytealign:
+; CHECK: str q0
+define void @twobytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+  store <4 x i32> %v, <4 x i32>* %p, align 2
+  ret void
+}
+; CHECK-LABEL: onebytealign:
+; CHECK: str q0
+define void @onebytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+  store <4 x i32> %v, <4 x i32>* %p, align 1
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/subvector-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/subvector-extend.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/subvector-extend.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/subvector-extend.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+; Test efficient codegen of vector extends up from legal type to 128 bit
+; and 256 bit vector types.
+
+;-----
+; Vectors of i16.
+;-----
+define <8 x i16> @func1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func1:
+; CHECK-NEXT: ushll.8h  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <8 x i8> %v0 to <8 x i16>
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @func2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func2:
+; CHECK-NEXT: sshll.8h  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <8 x i8> %v0 to <8 x i16>
+  ret <8 x i16> %r
+}
+
+define <16 x i16> @func3(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func3:
+; CHECK-NEXT: ushll2.8h  v1, v0, #0
+; CHECK-NEXT: ushll.8h  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <16 x i8> %v0 to <16 x i16>
+  ret <16 x i16> %r
+}
+
+define <16 x i16> @func4(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func4:
+; CHECK-NEXT: sshll2.8h  v1, v0, #0
+; CHECK-NEXT: sshll.8h  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <16 x i8> %v0 to <16 x i16>
+  ret <16 x i16> %r
+}
+
+;-----
+; Vectors of i32.
+;-----
+
+define <4 x i32> @afunc1(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc1:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <4 x i16> %v0 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @afunc2(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc2:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <4 x i16> %v0 to <4 x i32>
+  ret <4 x i32> %r
+}
+
+define <8 x i32> @afunc3(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc3:
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <8 x i16> %v0 to <8 x i32>
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @afunc4(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc4:
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <8 x i16> %v0 to <8 x i32>
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc1:
+; CHECK-NEXT: ushll.8h  v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <8 x i8> %v0 to <8 x i32>
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc2:
+; CHECK-NEXT: sshll.8h  v0, v0, #0
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <8 x i8> %v0 to <8 x i32>
+  ret <8 x i32> %r
+}
+
+;-----
+; Vectors of i64.
+;-----
+
+define <4 x i64> @zfunc1(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc1:
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <4 x i32> %v0 to <4 x i64>
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @zfunc2(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc2:
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <4 x i32> %v0 to <4 x i64>
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @bfunc3(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: bfunc3:
+; CHECK-NEXT: ushll.4s  v0, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = zext <4 x i16> %v0 to <4 x i64>
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @cfunc4(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: cfunc4:
+; CHECK-NEXT: sshll.4s  v0, v0, #0
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d  v0, v0, #0
+; CHECK-NEXT: ret
+  %r = sext <4 x i16> %v0 to <4 x i64>
+  ret <4 x i64> %r
+}

Added: llvm/trunk/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://13214163 - Make sure we generate a correct lookup table for the TBL
+; instruction when the element size of the vector is not 8 bits. We were
+; getting both the endianness wrong and the element indexing wrong.
+define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
+; CHECK:	.section	__TEXT,__literal16,16byte_literals
+; CHECK:	.align	4
+; CHECK:lCPI0_0:
+; CHECK:	.byte	0                       ; 0x0
+; CHECK:	.byte	1                       ; 0x1
+; CHECK:	.byte	0                       ; 0x0
+; CHECK:	.byte	1                       ; 0x1
+; CHECK:	.byte	0                       ; 0x0
+; CHECK:	.byte	1                       ; 0x1
+; CHECK:	.byte	0                       ; 0x0
+; CHECK:	.byte	1                       ; 0x1
+; CHECK:	.byte	8                       ; 0x8
+; CHECK:	.byte	9                       ; 0x9
+; CHECK:	.byte	8                       ; 0x8
+; CHECK:	.byte	9                       ; 0x9
+; CHECK:	.byte	8                       ; 0x8
+; CHECK:	.byte	9                       ; 0x9
+; CHECK:	.byte	8                       ; 0x8
+; CHECK:	.byte	9                       ; 0x9
+; CHECK:	.section __TEXT,__text,regular,pure_instructions
+; CHECK:	.globl	_foo
+; CHECK:	.align	2
+; CHECK:_foo:                                   ; @foo
+; CHECK:	adrp	[[BASE:x[0-9]+]], lCPI0_0 at PAGE
+; CHECK:	ldr	q[[REG:[0-9]+]], {{\[}}[[BASE]], lCPI0_0 at PAGEOFF]
+; CHECK:	tbl.16b	v0, { v0 }, v[[REG]]
+; CHECK:	ret
+
+  %val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i16> %val
+}

Added: llvm/trunk/test/CodeGen/ARM64/tbl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/tbl.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/tbl.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/tbl.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,132 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK: tbl1_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: tbl1_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK: tbl2_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK: tbl2_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbl3_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbl3_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbl4_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbl4_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK: tbx1_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK: tbx1_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbx2_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbx2_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbx3_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbx3_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK: tbx4_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK: tbx4_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+  ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+

Added: llvm/trunk/test/CodeGen/ARM64/this-return.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/this-return.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/this-return.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/this-return.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+%struct.A = type { i8 }
+%struct.B = type { i32 }
+%struct.C = type { %struct.B }
+%struct.D = type { %struct.B }
+%struct.E = type { %struct.B, %struct.B }
+
+declare %struct.A* @A_ctor_base(%struct.A* returned)
+declare %struct.B* @B_ctor_base(%struct.B* returned, i32)
+declare %struct.B* @B_ctor_complete(%struct.B* returned, i32)
+
+declare %struct.A* @A_ctor_base_nothisret(%struct.A*)
+declare %struct.B* @B_ctor_base_nothisret(%struct.B*, i32)
+declare %struct.B* @B_ctor_complete_nothisret(%struct.B*, i32)
+
+define %struct.C* @C_ctor_base(%struct.C* returned %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?A_ctor_base}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_base}}
+  %0 = bitcast %struct.C* %this to %struct.A*
+  %call = tail call %struct.A* @A_ctor_base(%struct.A* %0)
+  %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+  %call2 = tail call %struct.B* @B_ctor_base(%struct.B* %1, i32 %x)
+  ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base_nothisret:
+; CHECK: mov [[SAVETHIS:x[0-9]+]], x0
+; CHECK: bl {{_?A_ctor_base_nothisret}}
+; CHECK: mov x0, [[SAVETHIS]]
+; CHECK-NOT: b {{_?B_ctor_base_nothisret}}
+  %0 = bitcast %struct.C* %this to %struct.A*
+  %call = tail call %struct.A* @A_ctor_base_nothisret(%struct.A* %0)
+  %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+  %call2 = tail call %struct.B* @B_ctor_base_nothisret(%struct.B* %1, i32 %x)
+  ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete:
+; CHECK: b {{_?C_ctor_base}}
+  %call = tail call %struct.C* @C_ctor_base(%struct.C* %this, i32 %x)
+  ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete_nothisret:
+; CHECK-NOT: b {{_?C_ctor_base_nothisret}}
+  %call = tail call %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x)
+  ret %struct.C* %this
+}
+
+define %struct.D* @D_ctor_base(%struct.D* %this, i32 %x) {
+entry:
+; CHECK-LABEL: D_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?B_ctor_complete}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_complete}}
+  %b = getelementptr inbounds %struct.D* %this, i32 0, i32 0
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  ret %struct.D* %this
+}
+
+define %struct.E* @E_ctor_base(%struct.E* %this, i32 %x) {
+entry:
+; CHECK-LABEL: E_ctor_base:
+; CHECK-NOT: b {{_?B_ctor_complete}}
+  %b = getelementptr inbounds %struct.E* %this, i32 0, i32 0
+  %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+  %b2 = getelementptr inbounds %struct.E* %this, i32 0, i32 1
+  %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b2, i32 %x)
+  ret %struct.E* %this
+}

Added: llvm/trunk/test/CodeGen/ARM64/tls-darwin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/tls-darwin.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/tls-darwin.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/tls-darwin.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+
+@var = thread_local global i8 0
+
+; N.b. x0 must be the result of the first load (i.e. the address of the
+; descriptor) when tlv_get_addr is called. Likewise the result is returned in
+; x0.
+define i8 @get_var() {
+; CHECK-LABEL: get_var:
+; CHECK: adrp x[[TLVPDESC_SLOT_HI:[0-9]+]], _var@TLVPPAGE
+; CHECK: ldr x0, [x[[TLVPDESC_SLOT_HI]], _var@TLVPPAGEOFF]
+; CHECK: ldr [[TLV_GET_ADDR:x[0-9]+]], [x0]
+; CHECK: blr [[TLV_GET_ADDR]]
+; CHECK: ldrb w0, [x0]
+
+  %val = load i8* @var, align 1
+  ret i8 %val
+}

Added: llvm/trunk/test/CodeGen/ARM64/tls-dynamic-together.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/tls-dynamic-together.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/tls-dynamic-together.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/tls-dynamic-together.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+
+; If the .tlsdesccall and blr parts are emitted completely separately (even with
+; glue) then LLVM will separate them quite happily (with a spill at O0, hence
+; the option). This is definitely wrong, so we make sure they are emitted
+; together.
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+  %val = load i32* @general_dynamic_var
+  ret i32 %val
+
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr {{x[0-9]+}}
+}

Added: llvm/trunk/test/CodeGen/ARM64/tls-dynamics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/tls-dynamics.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/tls-dynamics.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/tls-dynamics.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,135 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+  %val = load i32* @general_dynamic_var
+  ret i32 %val
+
+  ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], x0]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+define i32* @test_generaldynamic_addr() {
+; CHECK-LABEL: test_generaldynamic_addr:
+
+  ret i32* @general_dynamic_var
+
+  ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], x0
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+}
+
+@local_dynamic_var = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic() {
+; CHECK-LABEL: test_localdynamic:
+
+  %val = load i32* @local_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
+; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+
+; CHECK: add x[[TPREL:[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+
+; CHECK: ldr w0, [x[[TPIDR]], x[[TPREL]]]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+define i32* @test_localdynamic_addr() {
+; CHECK-LABEL: test_localdynamic_addr:
+
+  ret i32* @local_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
+; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+
+; CHECK: add [[TPREL:x[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs [[TPIDR:x[0-9]+]], TPIDR_EL0
+
+; CHECK: add x0, [[TPIDR]], [[TPREL]]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+; The entire point of the local-dynamic access model is to have a single call to
+; the expensive resolver. Make sure we achieve that goal.
+
+@local_dynamic_var2 = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic_deduplicate() {
+; CHECK-LABEL: test_localdynamic_deduplicate:
+
+  %val = load i32* @local_dynamic_var
+  %val2 = load i32* @local_dynamic_var2
+
+  %sum = add i32 %val, %val2
+  ret i32 %sum
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK-NOT: _TLS_MODULE_BASE_
+
+; CHECK: ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/tls-execs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/tls-execs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/tls-execs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/tls-execs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+
+@initial_exec_var = external thread_local(initialexec) global i32
+
+define i32 @test_initial_exec() {
+; CHECK-LABEL: test_initial_exec:
+  %val = load i32* @initial_exec_var
+
+; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
+; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]]
+
+; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
+
+  ret i32 %val
+}
+
+define i32* @test_initial_exec_addr() {
+; CHECK-LABEL: test_initial_exec_addr:
+  ret i32* @initial_exec_var
+
+; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
+; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], [[TP_OFFSET]]
+
+; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
+
+}
+
+@local_exec_var = thread_local(localexec) global i32 0
+
+define i32 @test_local_exec() {
+; CHECK-LABEL: test_local_exec:
+  %val = load i32* @local_exec_var
+
+; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [0bAAA{{[01]+}},A,0b101AAAAA,0x92]
+; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]]
+
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
+
+  ret i32 %val
+}
+
+define i32* @test_local_exec_addr() {
+; CHECK-LABEL: test_local_exec_addr:
+  ret i32* @local_exec_var
+
+; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var
+; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], [[TP_OFFSET]]
+
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
+}

Added: llvm/trunk/test/CodeGen/ARM64/trap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/trap.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/trap.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/trap.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo() nounwind {
+; CHECK: foo
+; CHECK: brk #1
+  tail call void @llvm.trap()
+  ret void
+}
+declare void @llvm.trap() nounwind

Added: llvm/trunk/test/CodeGen/ARM64/trn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/trn.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/trn.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/trn.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+	ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrni16:
+;CHECK: trn1.4h
+;CHECK: trn2.4h
+;CHECK-NEXT: add.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+	%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+	ret <4 x i16> %tmp5
+}
+
+; 2xi32 TRN is redundant with ZIP
+define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrni32:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: add.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
+	%tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
+        %tmp5 = add <2 x i32> %tmp3, %tmp4
+	ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnf:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: fadd.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
+	%tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
+        %tmp5 = fadd <2 x float> %tmp3, %tmp4
+	ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrnQi8:
+;CHECK: trn1.16b
+;CHECK: trn2.16b
+;CHECK-NEXT: add.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+	%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+        %tmp5 = add <16 x i8> %tmp3, %tmp4
+	ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+	ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrnQi32:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: add.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+	%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+	ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnQf:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: fadd.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+	%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+        %tmp5 = fadd <4 x float> %tmp3, %tmp4
+	ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VTRN:
+
+define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8_undef:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
+	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+	ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16_undef:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
+	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+	ret <8 x i16> %tmp5
+}

Added: llvm/trunk/test/CodeGen/ARM64/trunc-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/trunc-store.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/trunc-store.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/trunc-store.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,75 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: xtn.8b v[[REG:[0-9]+]], v0
+; CHECK-NEXT: str d[[REG]], [x0]
+; CHECK-NEXT: ret
+  %tmp = trunc <8 x i16> %arg to <8 x i8>
+  store <8 x i8> %tmp, <8 x i8>* %p, align 8
+  ret void
+}
+
+@zptr8 = common global i8* null, align 8
+@zptr16 = common global i16* null, align 8
+@zptr32 = common global i32* null, align 8
+
+define void @fct32(i32 %arg, i64 %var) {
+; CHECK: fct32
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr32@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr32@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: str w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #2]
+; CHECK-NEXT: ret
+bb:
+  %.pre37 = load i32** @zptr32, align 8
+  %dec = add nsw i32 %arg, -1
+  %idxprom8 = sext i32 %dec to i64
+  %arrayidx9 = getelementptr inbounds i32* %.pre37, i64 %idxprom8
+  %tmp = trunc i64 %var to i32
+  store i32 %tmp, i32* %arrayidx9, align 4
+  ret void
+}
+
+define void @fct16(i32 %arg, i64 %var) {
+; CHECK: fct16
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr16@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr16@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: strh w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #1]
+; CHECK-NEXT: ret
+bb:
+  %.pre37 = load i16** @zptr16, align 8
+  %dec = add nsw i32 %arg, -1
+  %idxprom8 = sext i32 %dec to i64
+  %arrayidx9 = getelementptr inbounds i16* %.pre37, i64 %idxprom8
+  %tmp = trunc i64 %var to i16
+  store i16 %tmp, i16* %arrayidx9, align 4
+  ret void
+}
+
+define void @fct8(i32 %arg, i64 %var) {
+; CHECK: fct8
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr8@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr8@GOTPAGEOFF]
+; CHECK: ldr [[BASEADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: add [[ADDR:x[0-9]+]], [[BASEADDR]], w0, sxtw
+; w1 is %var truncated
+; CHECK-NEXT: sturb w1, {{\[}}[[ADDR]], #-1]
+; CHECK-NEXT: ret
+bb:
+  %.pre37 = load i8** @zptr8, align 8
+  %dec = add nsw i32 %arg, -1
+  %idxprom8 = sext i32 %dec to i64
+  %arrayidx9 = getelementptr inbounds i8* %.pre37, i64 %idxprom8
+  %tmp = trunc i64 %var to i8
+  store i8 %tmp, i8* %arrayidx9, align 4
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/umaxv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/umaxv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/umaxv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/umaxv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x8:
+; CHECK: umaxv.8b        b[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
+  %tmp = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u4x16:
+; CHECK: umaxv.4h        h[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
+  %tmp = trunc i32 %vmaxv.i to i16
+  %tobool = icmp eq i16 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x16:
+; CHECK: umaxv.8h        h[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
+  %tmp = trunc i32 %vmaxv.i to i16
+  %tobool = icmp eq i16 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u16x8:
+; CHECK: umaxv.16b        b[[REG:[0-9]+]], v0
+; CHECK: fmov     [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
+  %tmp = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/uminv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/uminv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/uminv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/uminv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x8:
+; CHECK: uminv.8b        b[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
+  %tmp = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u4x16:
+; CHECK: uminv.4h        h[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
+  %tmp = trunc i32 %vminv.i to i16
+  %tobool = icmp eq i16 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x16:
+; CHECK: uminv.8h        h[[REG:[0-9]+]], v0
+; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
+  %tmp = trunc i32 %vminv.i to i16
+  %tobool = icmp eq i16 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u16x8:
+; CHECK: uminv.16b        b[[REG:[0-9]+]], v0
+; CHECK: fmov     [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz     [[REG2]],
+entry:
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
+  %tmp = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %tmp, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/umov.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/umov.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/umov.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/umov.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define zeroext i8 @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: umov.b w0, v0[3]
+; CHECK-NEXT: ret
+  %vecext = extractelement <16 x i8> %a, i32 3
+  ret i8 %vecext
+}
+
+define zeroext i16 @f2(<4 x i16> %a) {
+; CHECK-LABEL: f2:
+; CHECK: umov.h w0, v0[2]
+; CHECK-NEXT: ret
+  %vecext = extractelement <4 x i16> %a, i32 2
+  ret i16 %vecext
+}
+
+define i32 @f3(<2 x i32> %a) {
+; CHECK-LABEL: f3:
+; CHECK: umov.s w0, v0[1]
+; CHECK-NEXT: ret
+  %vecext = extractelement <2 x i32> %a, i32 1
+  ret i32 %vecext
+}
+
+define i64 @f4(<2 x i64> %a) {
+; CHECK-LABEL: f4:
+; CHECK: umov.d x0, v0[1]
+; CHECK-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 1
+  ret i64 %vecext
+}

Added: llvm/trunk/test/CodeGen/ARM64/unaligned_ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/unaligned_ldst.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/unaligned_ldst.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/unaligned_ldst.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://r11231896
+
+define void @t1(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: orr
+; CHECK: ldr [[X0:x[0-9]+]], [x1]
+; CHECK: str [[X0]], [x0]
+  %tmp1 = bitcast i8* %b to i64*
+  %tmp2 = bitcast i8* %a to i64*
+  %tmp3 = load i64* %tmp1, align 1
+  store i64 %tmp3, i64* %tmp2, align 1
+  ret void
+}
+
+define void @t2(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: orr
+; CHECK: ldr [[W0:w[0-9]+]], [x1]
+; CHECK: str [[W0]], [x0]
+  %tmp1 = bitcast i8* %b to i32*
+  %tmp2 = bitcast i8* %a to i32*
+  %tmp3 = load i32* %tmp1, align 1
+  store i32 %tmp3, i32* %tmp2, align 1
+  ret void
+}
+
+define void @t3(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK-NOT: orr
+; CHECK: ldrh [[W0:w[0-9]+]], [x1]
+; CHECK: strh [[W0]], [x0]
+  %tmp1 = bitcast i8* %b to i16*
+  %tmp2 = bitcast i8* %a to i16*
+  %tmp3 = load i16* %tmp1, align 1
+  store i16 %tmp3, i16* %tmp2, align 1
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/uzp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/uzp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/uzp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/uzp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+	ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpi16:
+;CHECK: uzp1.4h
+;CHECK: uzp2.4h
+;CHECK-NEXT: add.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+	%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+	ret <4 x i16> %tmp5
+}
+
+define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpQi8:
+;CHECK: uzp1.16b
+;CHECK: uzp2.16b
+;CHECK-NEXT: add.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+	%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+        %tmp5 = add <16 x i8> %tmp3, %tmp4
+	ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+	ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vuzpQi32:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: add.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+	%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+	ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vuzpQf:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: fadd.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+	%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+        %tmp5 = fadd <4 x float> %tmp3, %tmp4
+	ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VUZP:
+
+define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8_undef:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
+	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+	ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16_undef:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
+	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+	ret <8 x i16> %tmp5
+}

Added: llvm/trunk/test/CodeGen/ARM64/vaargs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vaargs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vaargs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vaargs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-darwin11.0.0"
+
+define float @t1(i8* nocapture %fmt, ...) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: fcvt
+  %argp = alloca i8*, align 8
+  %argp1 = bitcast i8** %argp to i8*
+  call void @llvm.va_start(i8* %argp1)
+  %0 = va_arg i8** %argp, i32
+  %1 = va_arg i8** %argp, float
+  call void @llvm.va_end(i8* %argp1)
+  ret float %1
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind

Added: llvm/trunk/test/CodeGen/ARM64/vabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vabs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vabs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vabs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,796 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+
+define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl8h:
+;CHECK: sabdl.8h
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+        ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl4s:
+;CHECK: sabdl.4s
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+        ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2d:
+;CHECK: sabdl.2d
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+        ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl2_8h:
+;CHECK: sabdl2.8h
+        %load1 = load <16 x i8>* %A
+        %load2 = load <16 x i8>* %B
+        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+        ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl2_4s:
+;CHECK: sabdl2.4s
+        %load1 = load <8 x i16>* %A
+        %load2 = load <8 x i16>* %B
+        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+        ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2_2d:
+;CHECK: sabdl2.2d
+        %load1 = load <4 x i32>* %A
+        %load2 = load <4 x i32>* %B
+        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+        ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl8h:
+;CHECK: uabdl.8h
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl4s:
+;CHECK: uabdl.4s
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2d:
+;CHECK: uabdl.2d
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl2_8h:
+;CHECK: uabdl2.8h
+  %load1 = load <16 x i8>* %A
+  %load2 = load <16 x i8>* %B
+  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl2_4s:
+;CHECK: uabdl2.4s
+  %load1 = load <8 x i16>* %A
+  %load2 = load <8 x i16>* %B
+  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2_2d:
+;CHECK: uabdl2.2d
+  %load1 = load <4 x i32>* %A
+  %load2 = load <4 x i32>* %B
+  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+  ret <2 x i64> %tmp4
+}
+
+define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_2s:
+;CHECK: fabd.2s
+        %tmp1 = load <2 x float>* %A
+        %tmp2 = load <2 x float>* %B
+        %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+        ret <2 x float> %tmp3
+}
+
+define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_4s:
+;CHECK: fabd.4s
+        %tmp1 = load <4 x float>* %A
+        %tmp2 = load <4 x float>* %B
+        %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+        ret <4 x float> %tmp3
+}
+
+define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fabd_2d:
+;CHECK: fabd.2d
+        %tmp1 = load <2 x double>* %A
+        %tmp2 = load <2 x double>* %B
+        %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+        ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_8b:
+;CHECK: sabd.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_16b:
+;CHECK: sabd.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp2 = load <16 x i8>* %B
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_4h:
+;CHECK: sabd.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_8h:
+;CHECK: sabd.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_2s:
+;CHECK: sabd.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_4s:
+;CHECK: sabd.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_8b:
+;CHECK: uabd.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_16b:
+;CHECK: uabd.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp2 = load <16 x i8>* %B
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_4h:
+;CHECK: uabd.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_8h:
+;CHECK: uabd.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_2s:
+;CHECK: uabd.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_4s:
+;CHECK: uabd.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_8b:
+;CHECK: sqabs.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_16b:
+;CHECK: sqabs.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_4h:
+;CHECK: sqabs.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_8h:
+;CHECK: sqabs.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_2s:
+;CHECK: sqabs.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_4s:
+;CHECK: sqabs.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1)
+        ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_8b:
+;CHECK: sqneg.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_16b:
+;CHECK: sqneg.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_4h:
+;CHECK: sqneg.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_8h:
+;CHECK: sqneg.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_2s:
+;CHECK: sqneg.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_4s:
+;CHECK: sqneg.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1)
+        ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_8b:
+;CHECK: abs.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_16b:
+;CHECK: abs.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_4h:
+;CHECK: abs.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_8h:
+;CHECK: abs.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_2s:
+;CHECK: abs.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_4s:
+;CHECK: abs.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1)
+        ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
+; CHECK-LABEL: abs_1d:
+; CHECK: abs d0, d0
+  %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A)
+  ret <1 x i64> %abs
+}
+
+declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone
+
+define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
+;CHECK-LABEL: sabal8h:
+;CHECK: sabal.8h
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = load <8 x i16>* %C
+        %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sabal4s:
+;CHECK: sabal.4s
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = load <4 x i32>* %C
+        %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sabal2d:
+;CHECK: sabal.2d
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = load <2 x i64>* %C
+        %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+        %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
+        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ; high-half extract (shufflevector of lanes 8..15) + sabd + zext + add -> sabal2.8h
+;CHECK-LABEL: sabal2_8h:
+;CHECK: sabal2.8h
+        %load1 = load <16 x i8>* %A
+        %load2 = load <16 x i8>* %B
+        %tmp3 = load <8 x i16>* %C
+        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ; high-half extract (lanes 4..7) + sabd + zext + add -> sabal2.4s
+;CHECK-LABEL: sabal2_4s:
+;CHECK: sabal2.4s
+        %load1 = load <8 x i16>* %A
+        %load2 = load <8 x i16>* %B
+        %tmp3 = load <4 x i32>* %C
+        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ; high-half extract (lanes 2..3) + sabd + zext + add -> sabal2.2d
+;CHECK-LABEL: sabal2_2d:
+;CHECK: sabal2.2d
+        %load1 = load <4 x i32>* %A
+        %load2 = load <4 x i32>* %B
+        %tmp3 = load <2 x i64>* %C
+        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind { ; uabd intrinsic + zext + accumulate -> uabal.8h
+;CHECK-LABEL: uabal8h:
+;CHECK: uabal.8h
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = load <8 x i16>* %C
+        %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; uabd + zext + accumulate -> uabal.4s
+;CHECK-LABEL: uabal4s:
+;CHECK: uabal.4s
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = load <4 x i32>* %C
+        %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; uabd + zext + accumulate -> uabal.2d
+;CHECK-LABEL: uabal2d:
+;CHECK: uabal.2d
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = load <2 x i64>* %C
+        %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ; high-half extract (lanes 8..15) + uabd + zext + add -> uabal2.8h
+;CHECK-LABEL: uabal2_8h:
+;CHECK: uabal2.8h
+        %load1 = load <16 x i8>* %A
+        %load2 = load <16 x i8>* %B
+        %tmp3 = load <8 x i16>* %C
+        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ; high-half extract (lanes 4..7) + uabd + zext + add -> uabal2.4s
+;CHECK-LABEL: uabal2_4s:
+;CHECK: uabal2.4s
+        %load1 = load <8 x i16>* %A
+        %load2 = load <8 x i16>* %B
+        %tmp3 = load <4 x i32>* %C
+        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ; high-half extract (lanes 2..3) + uabd + zext + add -> uabal2.2d
+;CHECK-LABEL: uabal2_2d:
+;CHECK: uabal2.2d
+        %load1 = load <4 x i32>* %A
+        %load2 = load <4 x i32>* %B
+        %tmp3 = load <2 x i64>* %C
+        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ; same-width sabd + add (no widening) -> saba.8b
+;CHECK-LABEL: saba_8b:
+;CHECK: saba.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = load <8 x i8>* %C
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+        ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { ; sabd + add -> saba.16b
+;CHECK-LABEL: saba_16b:
+;CHECK: saba.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp2 = load <16 x i8>* %B
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp4 = load <16 x i8>* %C
+        %tmp5 = add <16 x i8> %tmp3, %tmp4
+        ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; sabd + add -> saba.4h
+;CHECK-LABEL: saba_4h:
+;CHECK: saba.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = load <4 x i16>* %C
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+        ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { ; sabd + add -> saba.8h
+;CHECK-LABEL: saba_8h:
+;CHECK: saba.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp4 = load <8 x i16>* %C
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+        ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; sabd + add -> saba.2s
+;CHECK-LABEL: saba_2s:
+;CHECK: saba.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = load <2 x i32>* %C
+        %tmp5 = add <2 x i32> %tmp3, %tmp4
+        ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { ; sabd + add -> saba.4s
+;CHECK-LABEL: saba_4s:
+;CHECK: saba.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp4 = load <4 x i32>* %C
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+        ret <4 x i32> %tmp5
+}
+
+define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ; same-width uabd + add (no widening) -> uaba.8b
+;CHECK-LABEL: uaba_8b:
+;CHECK: uaba.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = load <8 x i8>* %C
+        %tmp5 = add <8 x i8> %tmp3, %tmp4
+        ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { ; uabd + add -> uaba.16b
+;CHECK-LABEL: uaba_16b:
+;CHECK: uaba.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp2 = load <16 x i8>* %B
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp4 = load <16 x i8>* %C
+        %tmp5 = add <16 x i8> %tmp3, %tmp4
+        ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; uabd + add -> uaba.4h
+;CHECK-LABEL: uaba_4h:
+;CHECK: uaba.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = load <4 x i16>* %C
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+        ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { ; uabd + add -> uaba.8h
+;CHECK-LABEL: uaba_8h:
+;CHECK: uaba.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp4 = load <8 x i16>* %C
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+        ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; uabd + add -> uaba.2s
+;CHECK-LABEL: uaba_2s:
+;CHECK: uaba.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = load <2 x i32>* %C
+        %tmp5 = add <2 x i32> %tmp3, %tmp4
+        ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { ; uabd + add -> uaba.4s
+;CHECK-LABEL: uaba_4s:
+;CHECK: uaba.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp4 = load <4 x i32>* %C
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+        ret <4 x i32> %tmp5
+}
+
+; Scalar FABD
+define float @fabds(float %a, float %b) nounwind { ; scalar sisd.fabd intrinsic must select fabd on S registers
+; CHECK-LABEL: fabds:
+; CHECK: fabd s0, s0, s1
+  %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind
+  ret float %vabd.i
+}
+
+define double @fabdd(double %a, double %b) nounwind { ; scalar sisd.fabd intrinsic must select fabd on D registers
+; CHECK-LABEL: fabdd:
+; CHECK: fabd d0, d0, d1
+  %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind
+  ret double %vabd.i
+}
+
+declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone ; scalar fabd intrinsics used above
+declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone
+
+define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; high-half lhs vs. dup'd scalar rhs: must use uabdl2.2d directly, with no ext.16b to materialize the high half
+; CHECK-LABEL: uabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: uabdl2.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; signed twin of the test above: expects sabdl2.2d and no ext.16b
+; CHECK-LABEL: sabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: sabdl2.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}

Added: llvm/trunk/test/CodeGen/ARM64/vadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vadd.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vadd.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vadd.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,941 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; addhn intrinsic must select addhn.8b
+;CHECK-LABEL: addhn8b:
+;CHECK: addhn.8b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; addhn intrinsic -> addhn.4h
+;CHECK-LABEL: addhn4h:
+;CHECK: addhn.4h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; addhn intrinsic -> addhn.2s
+;CHECK-LABEL: addhn2s:
+;CHECK: addhn.2s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ; two narrowing calls concatenated by shufflevector: expects addhn.8b then addhn2.16b into the high half
+;CHECK-LABEL: addhn2_16b:
+;CHECK: addhn.8b
+;CHECK-NEXT: addhn2.16b
+  %vaddhn2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vaddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ; concat of two addhn results -> addhn.4h + addhn2.8h
+;CHECK-LABEL: addhn2_8h:
+;CHECK: addhn.4h
+;CHECK-NEXT: addhn2.8h
+  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vaddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ; concat of two addhn results -> addhn.2s + addhn2.4s
+;CHECK-LABEL: addhn2_4s:
+;CHECK: addhn.2s
+;CHECK-NEXT: addhn2.4s
+  %vaddhn2.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vaddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone ; addhn intrinsics used above
+declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; raddhn intrinsic must select raddhn.8b
+;CHECK-LABEL: raddhn8b:
+;CHECK: raddhn.8b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; raddhn intrinsic -> raddhn.4h
+;CHECK-LABEL: raddhn4h:
+;CHECK: raddhn.4h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; raddhn intrinsic -> raddhn.2s
+;CHECK-LABEL: raddhn2s:
+;CHECK: raddhn.2s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ; concat of two raddhn results: expects raddhn.8b then raddhn2.16b into the high half
+;CHECK-LABEL: raddhn2_16b:
+;CHECK: raddhn.8b
+;CHECK-NEXT: raddhn2.16b
+  %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vraddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ; concat of two raddhn results -> raddhn.4h + raddhn2.8h
+;CHECK-LABEL: raddhn2_8h:
+;CHECK: raddhn.4h
+;CHECK-NEXT: raddhn2.8h
+  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vraddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ; concat of two raddhn results -> raddhn.2s + raddhn2.4s
+;CHECK-LABEL: raddhn2_4s:
+;CHECK: raddhn.2s
+;CHECK-NEXT: raddhn2.4s
+  %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vraddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone ; raddhn intrinsics used above
+declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; sext both operands + add must fuse into saddl.8h
+;CHECK-LABEL: saddl8h:
+;CHECK: saddl.8h
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = add <8 x i16> %tmp3, %tmp4
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; sext + sext + add -> saddl.4s
+;CHECK-LABEL: saddl4s:
+;CHECK: saddl.4s
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = add <4 x i32> %tmp3, %tmp4
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; sext + sext + add -> saddl.2d
+;CHECK-LABEL: saddl2d:
+;CHECK: saddl.2d
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = add <2 x i64> %tmp3, %tmp4
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  { ; bitcast+shuffle high-half extraction (clang's vget_high pattern) + sext + add must be exactly one saddl2.8h
+; CHECK-LABEL: saddl2_8h:
+; CHECK-NEXT: saddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <16 x i8> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  { ; same pattern at i16 element width -> single saddl2.4s
+; CHECK-LABEL: saddl2_4s:
+; CHECK-NEXT: saddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <8 x i16> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  { ; same pattern at i32 element width -> single saddl2.2d
+; CHECK-LABEL: saddl2_2d:
+; CHECK-NEXT: saddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <4 x i32> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; zext both operands + add must fuse into uaddl.8h
+;CHECK-LABEL: uaddl8h:
+;CHECK: uaddl.8h
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = add <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; zext + zext + add -> uaddl.4s
+;CHECK-LABEL: uaddl4s:
+;CHECK: uaddl.4s
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = add <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; zext + zext + add -> uaddl.2d
+;CHECK-LABEL: uaddl2d:
+;CHECK: uaddl.2d
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = add <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+
+define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  { ; bitcast+shuffle high-half extraction + zext + add must be exactly one uaddl2.8h
+; CHECK-LABEL: uaddl2_8h:
+; CHECK-NEXT: uaddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <16 x i8> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  { ; same pattern at i16 element width -> single uaddl2.4s
+; CHECK-LABEL: uaddl2_4s:
+; CHECK-NEXT: uaddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <8 x i16> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  { ; same pattern at i32 element width -> single uaddl2.2d
+; CHECK-LABEL: uaddl2_2d:
+; CHECK-NEXT: uaddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp = bitcast <4 x i32> %a to <2 x i64>
+  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind { ; wide lhs + zext'd narrow rhs must fuse into uaddw.8h
+;CHECK-LABEL: uaddw8h:
+;CHECK: uaddw.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp4 = add <8 x i16> %tmp1, %tmp3
+        ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind { ; wide + zext narrow -> uaddw.4s
+;CHECK-LABEL: uaddw4s:
+;CHECK: uaddw.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp4 = add <4 x i32> %tmp1, %tmp3
+        ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { ; wide + zext narrow -> uaddw.2d
+;CHECK-LABEL: uaddw2d:
+;CHECK: uaddw.2d
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp4 = add <2 x i64> %tmp1, %tmp3
+        ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ; wide lhs + zext of high half (lanes 8..15) -> uaddw2.8h
+;CHECK-LABEL: uaddw2_8h:
+;CHECK: uaddw2.8h
+        %tmp1 = load <8 x i16>* %A
+
+        %tmp2 = load <16 x i8>* %B
+        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+        %res = add <8 x i16> %tmp1, %ext2
+        ret <8 x i16> %res
+}
+
+define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ; wide lhs + zext of high half (lanes 4..7) -> uaddw2.4s
+;CHECK-LABEL: uaddw2_4s:
+;CHECK: uaddw2.4s
+        %tmp1 = load <4 x i32>* %A
+
+        %tmp2 = load <8 x i16>* %B
+        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+        %res = add <4 x i32> %tmp1, %ext2
+        ret <4 x i32> %res
+}
+
+define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ; wide lhs + zext of high half (lanes 2..3) -> uaddw2.2d
+;CHECK-LABEL: uaddw2_2d:
+;CHECK: uaddw2.2d
+        %tmp1 = load <2 x i64>* %A
+
+        %tmp2 = load <4 x i32>* %B
+        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+        %res = add <2 x i64> %tmp1, %ext2
+        ret <2 x i64> %res
+}
+
+define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind { ; wide lhs + sext'd narrow rhs must fuse into saddw.8h
+;CHECK-LABEL: saddw8h:
+;CHECK: saddw.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+        %tmp4 = add <8 x i16> %tmp1, %tmp3
+        ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind { ; wide + sext narrow -> saddw.4s
+;CHECK-LABEL: saddw4s:
+;CHECK: saddw.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+        %tmp4 = add <4 x i32> %tmp1, %tmp3
+        ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { ; wide + sext narrow -> saddw.2d
+;CHECK-LABEL: saddw2d:
+;CHECK: saddw.2d
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+        %tmp4 = add <2 x i64> %tmp1, %tmp3
+        ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ; wide lhs + sext of high half (lanes 8..15) -> saddw2.8h
+;CHECK-LABEL: saddw2_8h:
+;CHECK: saddw2.8h
+        %tmp1 = load <8 x i16>* %A
+
+        %tmp2 = load <16 x i8>* %B
+        %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+        %res = add <8 x i16> %tmp1, %ext2
+        ret <8 x i16> %res
+}
+
+define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ; wide lhs + sext of high half (lanes 4..7) -> saddw2.4s
+;CHECK-LABEL: saddw2_4s:
+;CHECK: saddw2.4s
+        %tmp1 = load <4 x i32>* %A
+
+        %tmp2 = load <8 x i16>* %B
+        %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+        %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+        %res = add <4 x i32> %tmp1, %ext2
+        ret <4 x i32> %res
+}
+
+define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ; wide lhs + sext of high half (lanes 2..3) -> saddw2.2d
+;CHECK-LABEL: saddw2_2d:
+;CHECK: saddw2.2d
+        %tmp1 = load <2 x i64>* %A
+
+        %tmp2 = load <4 x i32>* %B
+        %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+        %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+        %res = add <2 x i64> %tmp1, %ext2
+        ret <2 x i64> %res
+}
+
+define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind { ; saddlp intrinsic must select saddlp.4h
+;CHECK-LABEL: saddlp4h:
+;CHECK: saddlp.4h
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind { ; saddlp intrinsic -> saddlp.2s
+;CHECK-LABEL: saddlp2s:
+;CHECK: saddlp.2s
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind { ; saddlp intrinsic -> saddlp.1d
+;CHECK-LABEL: saddlp1d:
+;CHECK: saddlp.1d
+        %tmp1 = load <2 x i32>* %A
+        %tmp3 = call <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
+        ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind { ; saddlp intrinsic -> saddlp.8h (128-bit form)
+;CHECK-LABEL: saddlp8h:
+;CHECK: saddlp.8h
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind { ; saddlp intrinsic -> saddlp.4s
+;CHECK-LABEL: saddlp4s:
+;CHECK: saddlp.4s
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind { ; saddlp intrinsic -> saddlp.2d
+;CHECK-LABEL: saddlp2d:
+;CHECK: saddlp.2d
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        ret <2 x i64> %tmp3
+}
+
+declare <4 x i16>  @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone ; saddlp intrinsics used above (64-bit forms)
+declare <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16>  @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone ; 128-bit forms
+declare <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind { ; uaddlp intrinsic must select uaddlp.4h
+;CHECK-LABEL: uaddlp4h:
+;CHECK: uaddlp.4h
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind { ; uaddlp intrinsic -> uaddlp.2s
+;CHECK-LABEL: uaddlp2s:
+;CHECK: uaddlp.2s
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind { ; uaddlp intrinsic -> uaddlp.1d
+;CHECK-LABEL: uaddlp1d:
+;CHECK: uaddlp.1d
+        %tmp1 = load <2 x i32>* %A
+        %tmp3 = call <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
+        ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind { ; uaddlp intrinsic -> uaddlp.8h (128-bit form)
+;CHECK-LABEL: uaddlp8h:
+;CHECK: uaddlp.8h
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind { ; uaddlp intrinsic -> uaddlp.4s
+;CHECK-LABEL: uaddlp4s:
+;CHECK: uaddlp.4s
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind { ; uaddlp intrinsic -> uaddlp.2d
+;CHECK-LABEL: uaddlp2d:
+;CHECK: uaddlp.2d
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        ret <2 x i64> %tmp3
+}
+
+declare <4 x i16>  @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone ; uaddlp intrinsics used above (64-bit forms)
+declare <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16>  @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone ; 128-bit forms
+declare <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind { ; saddlp intrinsic + add accumulator must fuse into sadalp.4h
+;CHECK-LABEL: sadalp4h:
+;CHECK: sadalp.4h
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp4 = load <4 x i16>* %B
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+        ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind { ; saddlp + add -> sadalp.2s
+;CHECK-LABEL: sadalp2s:
+;CHECK: sadalp.2s
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp4 = load <2 x i32>* %B
+        %tmp5 = add <2 x i32> %tmp3, %tmp4
+        ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind { ; saddlp + add -> sadalp.8h
+;CHECK-LABEL: sadalp8h:
+;CHECK: sadalp.8h
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp4 = load <8 x i16>* %B
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind { ; saddlp + add -> sadalp.4s
+;CHECK-LABEL: sadalp4s:
+;CHECK: sadalp.4s
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp4 = load <4 x i32>* %B
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind { ; saddlp + add -> sadalp.2d
+;CHECK-LABEL: sadalp2d:
+;CHECK: sadalp.2d
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp4 = load <2 x i64>* %B
+        %tmp5 = add <2 x i64> %tmp3, %tmp4
+        ret <2 x i64> %tmp5
+}
+
+define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp4h:
+;CHECK: uadalp.4h
+        %tmp1 = load <8 x i8>* %A
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp4 = load <4 x i16>* %B
+        %tmp5 = add <4 x i16> %tmp3, %tmp4
+        ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp2s:
+;CHECK: uadalp.2s
+        %tmp1 = load <4 x i16>* %A
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp4 = load <2 x i32>* %B
+        %tmp5 = add <2 x i32> %tmp3, %tmp4
+        ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp8h:
+;CHECK: uadalp.8h
+        %tmp1 = load <16 x i8>* %A
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp4 = load <8 x i16>* %B
+        %tmp5 = add <8 x i16> %tmp3, %tmp4
+        ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp4s:
+;CHECK: uadalp.4s
+        %tmp1 = load <8 x i16>* %A
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp4 = load <4 x i32>* %B
+        %tmp5 = add <4 x i32> %tmp3, %tmp4
+        ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uadalp2d:
+;CHECK: uadalp.2d
+        %tmp1 = load <4 x i32>* %A
+        %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp4 = load <2 x i64>* %B
+        %tmp5 = add <2 x i64> %tmp3, %tmp4
+        ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_8b:
+;CHECK: addp.8b
+        %tmp1 = load <8 x i8>* %A
+        %tmp2 = load <8 x i8>* %B
+        %tmp3 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_16b:
+;CHECK: addp.16b
+        %tmp1 = load <16 x i8>* %A
+        %tmp2 = load <16 x i8>* %B
+        %tmp3 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_4h:
+;CHECK: addp.4h
+        %tmp1 = load <4 x i16>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_8h:
+;CHECK: addp.8h
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %tmp3 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_2s:
+;CHECK: addp.2s
+        %tmp1 = load <2 x i32>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_4s:
+;CHECK: addp.4s
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %tmp3 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addp_2d:
+;CHECK: addp.2d
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %tmp3 = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_2s:
+;CHECK: faddp.2s
+        %tmp1 = load <2 x float>* %A
+        %tmp2 = load <2 x float>* %B
+        %tmp3 = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+        ret <2 x float> %tmp3
+}
+
+define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_4s:
+;CHECK: faddp.4s
+        %tmp1 = load <4 x float>* %A
+        %tmp2 = load <4 x float>* %B
+        %tmp3 = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+        ret <4 x float> %tmp3
+}
+
+define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: faddp_2d:
+;CHECK: faddp.2d
+        %tmp1 = load <2 x double>* %A
+        %tmp2 = load <2 x double>* %B
+        %tmp3 = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+        ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl2_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: uaddl2.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl2_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: saddl2.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl2_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: usubl2.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl2_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: ssubl2.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
+define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn8b_natural:
+;CHECK: addhn.8b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %sum = add <8 x i16> %tmp1, %tmp2
+        %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+        ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn4h_natural:
+;CHECK: addhn.4h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %sum = add <4 x i32> %tmp1, %tmp2
+        %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+        ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2s_natural:
+;CHECK: addhn.2s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %sum = add <2 x i64> %tmp1, %tmp2
+        %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+        ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn2_16b_natural:
+;CHECK: addhn2.16b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %sum = add <8 x i16> %tmp1, %tmp2
+        %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+        %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn2_8h_natural:
+;CHECK: addhn2.8h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %sum = add <4 x i32> %tmp1, %tmp2
+        %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+        %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+        ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2_4s_natural:
+;CHECK: addhn2.4s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %sum = add <2 x i64> %tmp1, %tmp2
+        %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+        %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+        ret <4 x i32> %res
+}
+
+define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn8b_natural:
+;CHECK: subhn.8b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %diff = sub <8 x i16> %tmp1, %tmp2
+        %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+        ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn4h_natural:
+;CHECK: subhn.4h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %diff = sub <4 x i32> %tmp1, %tmp2
+        %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+        ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2s_natural:
+;CHECK: subhn.2s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %diff = sub <2 x i64> %tmp1, %tmp2
+        %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+        ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn2_16b_natural:
+;CHECK: subhn2.16b
+        %tmp1 = load <8 x i16>* %A
+        %tmp2 = load <8 x i16>* %B
+        %diff = sub <8 x i16> %tmp1, %tmp2
+        %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+        %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+        %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+        ret <16 x i8> %res
+}
+
+define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn2_8h_natural:
+;CHECK: subhn2.8h
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i32>* %B
+        %diff = sub <4 x i32> %tmp1, %tmp2
+        %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+        %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+        %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+        ret <8 x i16> %res
+}
+
+define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2_4s_natural:
+;CHECK: subhn2.4s
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i64>* %B
+        %diff = sub <2 x i64> %tmp1, %tmp2
+        %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+        %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+        %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+        ret <4 x i32> %res
+}

Added: llvm/trunk/test/CodeGen/ARM64/vaddlv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vaddlv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vaddlv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vaddlv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,26 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
+; CHECK: test_vaddlv_s32
+; CHECK: saddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddlv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
+  ret i64 %vaddlv.i
+}
+
+define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
+; CHECK: test_vaddlv_u32
+; CHECK: uaddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddlv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
+  ret i64 %vaddlv.i
+}
+
+declare i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
+declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
+

Added: llvm/trunk/test/CodeGen/ARM64/vaddv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vaddv.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vaddv.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vaddv.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,233 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_s8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a1)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_s16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a1)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddv_s32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_s32:
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a1)
+  ret i32 %vaddv.i
+}
+
+define i64 @test_vaddv_s64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_s64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64> %a1)
+  ret i64 %vaddv.i
+}
+
+define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8_masked:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+  %0 = and i32 %vaddv.i, 511 ; 0x1ff
+  ret i32 %0
+}
+
+define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16_masked:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+  %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
+  ret i32 %0
+}
+
+define i32 @test_vaddv_u32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_u32:
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
+  ret i32 %vaddv.i
+}
+
+define float @test_vaddv_f32(<2 x float> %a1) {
+; CHECK-LABEL: test_vaddv_f32:
+; CHECK: faddp.2s s0, v0
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+  ret float %vaddv.i
+}
+
+define float @test_vaddv_v4f32(<4 x float> %a1) {
+; CHECK-LABEL: test_vaddv_v4f32:
+; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
+; CHECK: faddp.2s s0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+  ret float %vaddv.i
+}
+
+define double @test_vaddv_f64(<2 x double> %a1) {
+; CHECK-LABEL: test_vaddv_f64:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+  ret double %vaddv.i
+}
+
+define i64 @test_vaddv_u64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+  ret i64 %vaddv.i
+}
+
+define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_s8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a1)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_s16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a1)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_s32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a1)
+  ret i32 %vaddv.i
+}
+
+define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_u8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_u16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_u32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
+  ret i32 %vaddv.i
+}
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>)
+
+declare i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+declare float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+declare double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)

Added: llvm/trunk/test/CodeGen/ARM64/variadic-aapcs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/variadic-aapcs.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/variadic-aapcs.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/variadic-aapcs.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,143 @@
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+
+%va_list = type {i8*, i8*, i8*, i32, i32}
+
+ at var = global %va_list zeroinitializer, align 8
+
+declare void @llvm.va_start(i8*)
+
+define void @test_simple(i32 %n, ...) {
+; CHECK-LABEL: test_simple:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q0, q1, [sp]
+; ... omit middle ones ...
+; CHECK: stp q6, q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #56
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+
+  %addr = bitcast %va_list* @var to i8*
+  call void @llvm.va_start(i8* %addr)
+
+  ret void
+}
+
+define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
+; CHECK-LABEL: test_fewargs:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q1, q2, [sp]
+; ... omit middle ones ...
+; CHECK: str q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #40
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #39
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: movn [[VR_OFFS:w[0-9]+]], #111
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+
+  %addr = bitcast %va_list* @var to i8*
+  call void @llvm.va_start(i8* %addr)
+
+  ret void
+}
+
+define void @test_nospare([8 x i64], [8 x float], ...) {
+; CHECK-LABEL: test_nospare:
+
+  %addr = bitcast %va_list* @var to i8*
+  call void @llvm.va_start(i8* %addr)
+; CHECK-NOT: sub sp, sp
+; CHECK: mov [[STACK:x[0-9]+]], sp
+; CHECK: str [[STACK]], [{{x[0-9]+}}, :lo12:var]
+
+  ret void
+}
+
+; If there are non-variadic arguments on the stack (here two i64s) then the
+; __stack field should point just past them.
+define void @test_offsetstack([10 x i64], [3 x float], ...) {
+; CHECK-LABEL: test_offsetstack:
+; CHECK: sub sp, sp, #80
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
+; CHECK: str [[STACK_TOP]], [{{x[0-9]+}}, :lo12:var]
+
+  %addr = bitcast %va_list* @var to i8*
+  call void @llvm.va_start(i8* %addr)
+  ret void
+}
+
+declare void @llvm.va_end(i8*)
+
+define void @test_va_end() nounwind {
+; CHECK-LABEL: test_va_end:
+; CHECK-NEXT: BB#0
+
+  %addr = bitcast %va_list* @var to i8*
+  call void @llvm.va_end(i8* %addr)
+
+  ret void
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.va_copy(i8* %dest, i8* %src)
+
+ at second_list = global %va_list zeroinitializer
+
+define void @test_va_copy() {
+; CHECK-LABEL: test_va_copy:
+  %srcaddr = bitcast %va_list* @var to i8*
+  %dstaddr = bitcast %va_list* @second_list to i8*
+  call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr)
+
+; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
+; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
+; CHECK: str [[BLOCK]], [x[[DST]]]
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
+; CHECK: str [[BLOCK]], [x[[DST]], #16]
+  ret void
+; CHECK: ret
+}

Added: llvm/trunk/test/CodeGen/ARM64/vbitwise.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vbitwise.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vbitwise.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vbitwise.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_8b:
+;CHECK: rbit.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp3 = call <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8> %tmp1)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @rbit_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_16b:
+;CHECK: rbit.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp3 = call <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8> %tmp1)
+	ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+
+define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sxtl8h:
+;CHECK: sshll.8h
+	%tmp1 = load <8 x i8>* %A
+  %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+  ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @uxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uxtl8h:
+;CHECK: ushll.8h
+	%tmp1 = load <8 x i8>* %A
+  %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @sxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sxtl4s:
+;CHECK: sshll.4s
+	%tmp1 = load <4 x i16>* %A
+  %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+  ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @uxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uxtl4s:
+;CHECK: ushll.4s
+	%tmp1 = load <4 x i16>* %A
+  %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+  ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @sxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sxtl2d:
+;CHECK: sshll.2d
+	%tmp1 = load <2 x i32>* %A
+  %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+  ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @uxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uxtl2d:
+;CHECK: ushll.2d
+	%tmp1 = load <2 x i32>* %A
+  %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+  ret <2 x i64> %tmp2
+}
+
+; Check for incorrect use of vector bic.
+; rdar://11553859
+define void @test_vsliq(i8* nocapture %src, i8* nocapture %dest) nounwind noinline ssp {
+entry:
+; CHECK-LABEL: test_vsliq:
+; CHECK-NOT: bic
+; CHECK: movi.2d [[REG1:v[0-9]+]], #0x0000ff000000ff
+; CHECK: and.16b v{{[0-9]+}}, v{{[0-9]+}}, [[REG1]]
+  %0 = bitcast i8* %src to <16 x i8>*
+  %1 = load <16 x i8>* %0, align 16
+  %and.i = and <16 x i8> %1, <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>
+  %2 = bitcast <16 x i8> %and.i to <8 x i16>
+  %vshl_n = shl <8 x i16> %2, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %3 = or <8 x i16> %2, %vshl_n
+  %4 = bitcast <8 x i16> %3 to <4 x i32>
+  %vshl_n8 = shl <4 x i32> %4, <i32 16, i32 16, i32 16, i32 16>
+  %5 = or <4 x i32> %4, %vshl_n8
+  %6 = bitcast <4 x i32> %5 to <16 x i8>
+  %7 = bitcast i8* %dest to <16 x i8>*
+  store <16 x i8> %6, <16 x i8>* %7, align 16
+  ret void
+}

Added: llvm/trunk/test/CodeGen/ARM64/vclz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vclz.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vclz.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vclz.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,109 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_u8:
+  ; CHECK: clz.8b v0, v0
+  ; CHECK-NEXT: ret
+  %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+  ret <8 x i8> %vclz.i
+}
+
+define <8 x i8> @test_vclz_s8(<8 x i8> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_s8:
+  ; CHECK: clz.8b v0, v0
+  ; CHECK-NEXT: ret
+  %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+  ret <8 x i8> %vclz.i
+}
+
+define <4 x i16> @test_vclz_u16(<4 x i16> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_u16:
+  ; CHECK: clz.4h v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+  ret <4 x i16> %vclz1.i
+}
+
+define <4 x i16> @test_vclz_s16(<4 x i16> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_s16:
+  ; CHECK: clz.4h v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+  ret <4 x i16> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_u32(<2 x i32> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_u32:
+  ; CHECK: clz.2s v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+  ret <2 x i32> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_s32(<2 x i32> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclz_s32:
+  ; CHECK: clz.2s v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+  ret <2 x i32> %vclz1.i
+}
+
+define <16 x i8> @test_vclzq_u8(<16 x i8> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_u8:
+  ; CHECK: clz.16b v0, v0
+  ; CHECK-NEXT: ret
+  %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+  ret <16 x i8> %vclz.i
+}
+
+define <16 x i8> @test_vclzq_s8(<16 x i8> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_s8:
+  ; CHECK: clz.16b v0, v0
+  ; CHECK-NEXT: ret
+  %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+  ret <16 x i8> %vclz.i
+}
+
+define <8 x i16> @test_vclzq_u16(<8 x i16> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_u16:
+  ; CHECK: clz.8h v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+  ret <8 x i16> %vclz1.i
+}
+
+define <8 x i16> @test_vclzq_s16(<8 x i16> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_s16:
+  ; CHECK: clz.8h v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+  ret <8 x i16> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_u32(<4 x i32> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_u32:
+  ; CHECK: clz.4s v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+  ret <4 x i32> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_s32(<4 x i32> %a) nounwind readnone ssp {
+  ; CHECK-LABEL: test_vclzq_s32:
+  ; CHECK: clz.4s v0, v0
+  ; CHECK-NEXT: ret
+  %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+  ret <4 x i32> %vclz1.i
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcmp.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcmp.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcmp.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,227 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+
+define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind {
+;CHECK-LABEL: fcmltz_4s:
+;CHECK: fcmlt.4s [[REG:v[0-9]+]], v0, #0
+;CHECK-NEXT: xtn.4h v[[REG_1:[0-9]+]], [[REG]]
+;CHECK-NEXT: str d[[REG_1]], [x0]
+;CHECK-NEXT: ret
+  %tmp = fcmp olt <4 x float> %a, zeroinitializer
+  %tmp2 = sext <4 x i1> %tmp to <4 x i16>
+  store <4 x i16> %tmp2, <4 x i16>* %p, align 8
+  ret void
+}
+
+define <2 x i32> @facge_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facge_2s:
+;CHECK: facge.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facge_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facge_4s:
+;CHECK: facge.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facge_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facge_2d:
+;CHECK: facge.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_2s:
+;CHECK: facgt.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facgt_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_4s:
+;CHECK: facgt.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facgt_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facgt_2d:
+;CHECK: facgt.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @facge_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facge_s:
+; CHECK: facge {{s[0-9]+}}, s0, s1
+  %mask = call i32 @llvm.arm64.neon.facge.i32.f32(float %A, float %B)
+  ret i32 %mask
+}
+
+define i64 @facge_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facge_d:
+; CHECK: facge {{d[0-9]+}}, d0, d1
+  %mask = call i64 @llvm.arm64.neon.facge.i64.f64(double %A, double %B)
+  ret i64 %mask
+}
+
+declare i64 @llvm.arm64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.arm64.neon.facge.i32.f32(float, float)
+
+define i32 @facgt_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facgt_s:
+; CHECK: facgt {{s[0-9]+}}, s0, s1
+  %mask = call i32 @llvm.arm64.neon.facgt.i32.f32(float %A, float %B)
+  ret i32 %mask
+}
+
+define i64 @facgt_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facgt_d:
+; CHECK: facgt {{d[0-9]+}}, d0, d1
+  %mask = call i64 @llvm.arm64.neon.facgt.i64.f64(double %A, double %B)
+  ret i64 %mask
+}
+
+declare i64 @llvm.arm64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.arm64.neon.facgt.i32.f32(float, float)
+
+define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_8b:
+;CHECK: cmtst.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %commonbits = and <8 x i8> %tmp1, %tmp2
+  %mask = icmp ne <8 x i8> %commonbits, zeroinitializer
+  %res = sext <8 x i1> %mask to <8 x i8>
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @cmtst_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_16b:
+;CHECK: cmtst.16b
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = load <16 x i8>* %B
+  %commonbits = and <16 x i8> %tmp1, %tmp2
+  %mask = icmp ne <16 x i8> %commonbits, zeroinitializer
+  %res = sext <16 x i1> %mask to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <4 x i16> @cmtst_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_4h:
+;CHECK: cmtst.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %commonbits = and <4 x i16> %tmp1, %tmp2
+  %mask = icmp ne <4 x i16> %commonbits, zeroinitializer
+  %res = sext <4 x i1> %mask to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @cmtst_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_8h:
+;CHECK: cmtst.8h
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %commonbits = and <8 x i16> %tmp1, %tmp2
+  %mask = icmp ne <8 x i16> %commonbits, zeroinitializer
+  %res = sext <8 x i1> %mask to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <2 x i32> @cmtst_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_2s:
+;CHECK: cmtst.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %commonbits = and <2 x i32> %tmp1, %tmp2
+  %mask = icmp ne <2 x i32> %commonbits, zeroinitializer
+  %res = sext <2 x i1> %mask to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @cmtst_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_4s:
+;CHECK: cmtst.4s
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i32>* %B
+  %commonbits = and <4 x i32> %tmp1, %tmp2
+  %mask = icmp ne <4 x i32> %commonbits, zeroinitializer
+  %res = sext <4 x i1> %mask to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @cmtst_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: cmtst_2d:
+;CHECK: cmtst.2d
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i64>* %B
+  %commonbits = and <2 x i64> %tmp1, %tmp2
+  %mask = icmp ne <2 x i64> %commonbits, zeroinitializer
+  %res = sext <2 x i1> %mask to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define <1 x i64> @fcmeq_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmeq_d:
+; CHECK: fcmeq {{d[0-9]+}}, d0, d1
+  %tst = fcmp oeq <1 x double> %A, %B
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmge_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmge_d:
+; CHECK: fcmge {{d[0-9]+}}, d0, d1
+  %tst = fcmp oge <1 x double> %A, %B
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmle_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmle_d:
+; CHECK: fcmge {{d[0-9]+}}, d1, d0
+  %tst = fcmp ole <1 x double> %A, %B
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmgt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d0, d1
+  %tst = fcmp ogt <1 x double> %A, %B
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmlt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d1, d0
+  %tst = fcmp olt <1 x double> %A, %B
+  %mask = sext <1 x i1> %tst to <1 x i64>
+  ret <1 x i64> %mask
+}

Added: llvm/trunk/test/CodeGen/ARM64/vcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcnt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcnt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcnt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_8b:
+;CHECK: cls.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp3 = call <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8> %tmp1)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_16b:
+;CHECK: cls.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp3 = call <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8> %tmp1)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_4h:
+;CHECK: cls.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp3 = call <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16> %tmp1)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_8h:
+;CHECK: cls.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp3 = call <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16> %tmp1)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_2s:
+;CHECK: cls.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_4s:
+;CHECK: cls.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vcombine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcombine.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcombine.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcombine.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; LowerCONCAT_VECTORS() was reversing the order of two parts.
+; rdar://11558157
+; rdar://11559553
+define <16 x i8> @test(<16 x i8> %q0, <16 x i8> %q1, i8* nocapture %dest) nounwind {
+entry:
+; CHECK-LABEL: test:
+; CHECK: ins.d v0[1], v1[0]
+  %0 = bitcast <16 x i8> %q0 to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> zeroinitializer
+  %1 = bitcast <16 x i8> %q1 to <2 x i64>
+  %shuffle.i4 = shufflevector <2 x i64> %1, <2 x i64> undef, <1 x i32> zeroinitializer
+  %shuffle.i3 = shufflevector <1 x i64> %shuffle.i, <1 x i64> %shuffle.i4, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i3 to <16 x i8>
+  ret <16 x i8> %2
+}

Added: llvm/trunk/test/CodeGen/ARM64/vcvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvt.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvt.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvt.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,686 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtas_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtau_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtms_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtps_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtns_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptosi <2 x float> %A to <2 x i32>
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzs_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptosi <4 x float> %A to <4 x i32>
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptosi <2 x double> %A to <2 x i64>
+	ret <2 x i64> %tmp3
+}
+
+
+define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptoui <2 x float> %A to <2 x i32>
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptoui <4 x float> %A to <4 x i32>
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = fptoui <2 x double> %A to <2 x i64>
+	ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frinta_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinta_2s:
+;CHECK-NOT: ld1
+;CHECK: frinta.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.round.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinta_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinta_4s:
+;CHECK-NOT: ld1
+;CHECK: frinta.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.round.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinta_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinta_2d:
+;CHECK-NOT: ld1
+;CHECK: frinta.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.round.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.round.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.round.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.round.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frinti_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinti_2s:
+;CHECK-NOT: ld1
+;CHECK: frinti.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinti_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinti_4s:
+;CHECK-NOT: ld1
+;CHECK: frinti.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinti_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinti_2d:
+;CHECK-NOT: ld1
+;CHECK: frinti.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintm_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintm_2s:
+;CHECK-NOT: ld1
+;CHECK: frintm.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.floor.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintm_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintm_4s:
+;CHECK-NOT: ld1
+;CHECK: frintm.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintm_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintm_2d:
+;CHECK-NOT: ld1
+;CHECK: frintm.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintn_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintn_2s:
+;CHECK-NOT: ld1
+;CHECK: frintn.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintn_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintn_4s:
+;CHECK-NOT: ld1
+;CHECK: frintn.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintn_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintn_2d:
+;CHECK-NOT: ld1
+;CHECK: frintn.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintp_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintp_2s:
+;CHECK-NOT: ld1
+;CHECK: frintp.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.ceil.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintp_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintp_4s:
+;CHECK-NOT: ld1
+;CHECK: frintp.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintp_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintp_2d:
+;CHECK-NOT: ld1
+;CHECK: frintp.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintx_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintx_2s:
+;CHECK-NOT: ld1
+;CHECK: frintx.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.rint.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintx_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintx_4s:
+;CHECK-NOT: ld1
+;CHECK: frintx.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintx_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintx_2d:
+;CHECK-NOT: ld1
+;CHECK: frintx.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintz_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintz_2s:
+;CHECK-NOT: ld1
+;CHECK: frintz.2s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.trunc.v2f32(<2 x float> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintz_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintz_4s:
+;CHECK-NOT: ld1
+;CHECK: frintz.4s v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %A)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintz_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintz_2d:
+;CHECK-NOT: ld1
+;CHECK: frintz.2d v0, v0
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %A)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn v0.2s, v0.2d
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn2 v0.4s, v1.2d
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+        %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+	ret <4 x float> %res
+}
+
+declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
+	ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: scvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+
+define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: ucvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+	%tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+	ret <2 x double> %tmp3
+}
+
+
+;CHECK-LABEL: autogen_SD28458:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD28458() {
+  %Tr53 = fptrunc <8 x double> undef to <8 x float>
+  store <8 x float> %Tr53, <8 x float>* undef
+  ret void
+}
+
+;CHECK-LABEL: autogen_SD19225:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD19225() {
+  %A = load <8 x float>* undef
+  %Tr53 = fpext <8 x float> %A to <8 x double>
+  store <8 x double> %Tr53, <8 x double>* undef
+  ret void
+}
+
+declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vcvt_f.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvt_f.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvt_f.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvt_f.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f64_f32:
+  %vcvt1.i = fpext <2 x float> %x to <2 x double>
+; CHECK: fcvtl	v0.2d, v0.2s
+  ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f64_f32:
+  %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
+; CHECK: fcvtl2	v0.2d, v0.4s
+  ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f32_f64:
+  %vcvt1.i = fptrunc <2 x double> %v to <2 x float>
+; CHECK: fcvtn
+  ret <2 x float> %vcvt1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f32_f64:
+
+  %cvt = fptrunc <2 x double> %v to <2 x float>
+  %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtn2
+  ret <4 x float> %vcvt2.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_f32_f64:
+  %vcvtx1.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+; CHECK: fcvtxn
+  ret <2 x float> %vcvtx1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_high_f32_f64:
+  %vcvtx2.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+  %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtxn2
+  ret <4 x float> %res
+; CHECK: ret
+}
+
+
+declare <2 x double> @llvm.arm64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfp2df(<2 x float>) nounwind readnone
+
+declare <2 x float> @llvm.arm64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
+
+declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
+define i16 @to_half(float %in) {
+; CHECK-LABEL: to_half:
+; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
+; CHECK: fmov w0, s[[HALFVAL]]
+
+  %res = call i16 @llvm.convert.to.fp16(float %in)
+  ret i16 %res
+}
+
+define float @from_half(i16 %in) {
+; CHECK-LABEL: from_half:
+; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
+; CHECK: fcvt s0, h[[HALFVAL]]
+  %res = call float @llvm.convert.from.fp16(i16 %in)
+  ret float %res
+}
+
+declare float @llvm.convert.from.fp16(i16) #1
+declare i16 @llvm.convert.to.fp16(float) #1

Added: llvm/trunk/test/CodeGen/ARM64/vcvt_f32_su32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvt_f32_su32.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvt_f32_su32.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvt_f32_su32.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,73 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvt:
+; CHECK: ucvtf.2s  v0, v0
+; CHECK: ret
+
+  %vcvt.i = uitofp <2 x i32> %a to <2 x float>
+  ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @scvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvt:
+; CHECK: scvtf.2s  v0, v0
+; CHECK: ret
+  %vcvt.i = sitofp <2 x i32> %a to <2 x float>
+  ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @ucvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvtq:
+; CHECK: ucvtf.4s  v0, v0
+; CHECK: ret
+  %vcvt.i = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @scvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvtq:
+; CHECK: scvtf.4s  v0, v0
+; CHECK: ret
+  %vcvt.i = sitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16:
+; CHECK: fcvtl  v0.4s, v0.4h
+; CHECK-NEXT: ret
+  %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %a) nounwind
+  ret <4 x float> %vcvt1.i
+}
+
+define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16_high:
+; CHECK: fcvtl2  v0.4s, v0.8h
+; CHECK-NEXT: ret
+  %in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %in) nounwind
+  ret <4 x float> %vcvt1.i
+}
+
+
+
+define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16f32:
+; CHECK: fcvtn  v0.4h, v0.4s
+; CHECK-NEXT: ret
+  %vcvt1.i = tail call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %a) nounwind
+  ret <4 x i16> %vcvt1.i
+}
+
+define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) {
+; CHECK-LABEL: cvtf16f32_high:
+; CHECK: fcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+  %high = call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %high_big)
+  %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %res
+}
+
+declare <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float>) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vcvt_n.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvt_n.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvt_n.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvt_n.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxpu:
+; CHECK: ucvtf.2s	v0, v0, #9
+; CHECK: ret
+  %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
+  ret <2 x float> %vcvt_n1
+}
+
+define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxps:
+; CHECK: scvtf.2s	v0, v0, #12
+; CHECK: ret
+  %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
+  ret <2 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxpu:
+; CHECK: ucvtf.4s	v0, v0, #18
+; CHECK: ret
+  %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
+  ret <4 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxps:
+; CHECK: scvtf.4s	v0, v0, #30
+; CHECK: ret
+  %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
+  ret <4 x float> %vcvt_n1
+}
+define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
+  %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
+  ret <2 x double> %vcvt_n1
+}
+
+define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
+  %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
+  ret <2 x double> %vcvt_n1
+}
+
+declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vcvt_su32_f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvt_su32_f32.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvt_su32_f32.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvt_su32_f32.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp {
+; CHECK: c1
+; CHECK: fcvtzs.2s	v0, v0
+; CHECK: ret
+  %vcvt.i = fptosi <2 x float> %a to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+define <2 x i32> @c2(<2 x float> %a) nounwind readnone ssp {
+; CHECK: c2
+; CHECK: fcvtzu.2s	v0, v0
+; CHECK: ret
+  %vcvt.i = fptoui <2 x float> %a to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @c3(<4 x float> %a) nounwind readnone ssp {
+; CHECK: c3
+; CHECK: fcvtzs.4s	v0, v0
+; CHECK: ret
+  %vcvt.i = fptosi <4 x float> %a to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+
+define <4 x i32> @c4(<4 x float> %a) nounwind readnone ssp {
+; CHECK: c4
+; CHECK: fcvtzu.4s	v0, v0
+; CHECK: ret
+  %vcvt.i = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+

Added: llvm/trunk/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vcvtxd_f32_f64.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vcvtxd_f32_f64.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vcvtxd_f32_f64.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @fcvtxn(double %a) {
+; CHECK-LABEL: fcvtxn:
+; CHECK: fcvtxn s0, d0
+; CHECK-NEXT: ret
+  %vcvtxd.i = tail call float @llvm.arm64.sisd.fcvtxn(double %a) nounwind
+  ret float %vcvtxd.i
+}
+
+declare float @llvm.arm64.sisd.fcvtxn(double) nounwind readnone

Added: llvm/trunk/test/CodeGen/ARM64/vecCmpBr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vecCmpBr.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vecCmpBr.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vecCmpBr.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,207 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; ModuleID = 'arm64_vecCmpBr.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+
+define i32 @anyZero64(<4 x i16> %a) #0 {
+; CHECK: _anyZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+  %0 = bitcast <4 x i16> %a to <8 x i8>
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+  %1 = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @bar(...) #1
+
+define i32 @anyZero128(<8 x i16> %a) #0 {
+; CHECK: _anyZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+
+entry:
+  %0 = bitcast <8 x i16> %a to <16 x i8>
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+  %1 = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @anyNonZero64(<4 x i16> %a) #0 {
+; CHECK: _anyNonZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+
+entry:
+  %0 = bitcast <4 x i16> %a to <8 x i8>
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+  %1 = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @anyNonZero128(<8 x i16> %a) #0 {
+; CHECK: _anyNonZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+  %0 = bitcast <8 x i16> %a to <16 x i8>
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+  %1 = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @allZero64(<4 x i16> %a) #0 {
+; CHECK: _allZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+  %0 = bitcast <4 x i16> %a to <8 x i8>
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+  %1 = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @allZero128(<8 x i16> %a) #0 {
+; CHECK: _allZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+  %0 = bitcast <8 x i16> %a to <16 x i8>
+  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+  %1 = trunc i32 %vmaxv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @allNonZero64(<4 x i16> %a) #0 {
+; CHECK: _allNonZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+  %0 = bitcast <4 x i16> %a to <8 x i8>
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+  %1 = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define i32 @allNonZero128(<8 x i16> %a) #0 {
+; CHECK: _allNonZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+  %0 = bitcast <8 x i16> %a to <16 x i8>
+  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+  %1 = trunc i32 %vminv.i to i8
+  %tobool = icmp eq i8 %1, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) #2
+
+declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) #2
+
+attributes #0 = { nounwind ssp "target-cpu"="cyclone" }
+attributes #1 = { "target-cpu"="cyclone" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { nobuiltin nounwind }

Added: llvm/trunk/test/CodeGen/ARM64/vecFold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/vecFold.ll?rev=205090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/vecFold.ll (added)
+++ llvm/trunk/test/CodeGen/ARM64/vecFold.ll Sat Mar 29 05:18:08 2014
@@ -0,0 +1,145 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s| FileCheck %s
+
+define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov16i8:
+  %vshrn_low_shift = lshr <8 x i16> %a0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  %vshrn_low = trunc <8 x i16> %vshrn_low_shift to <8 x i8>
+  %vshrn_high_shift = lshr <8 x i16> %b0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  %vshrn_high = trunc <8 x i16> %vshrn_high_shift to <8 x i8>
+; CHECK: shrn.8b v0, v0, #5
+; CHECK-NEXT: shrn2.16b v0, v1, #5
+; CHECK-NEXT: ret
+  %1 = bitcast <8 x i8> %vshrn_low to <1 x i64>
+  %2 = bitcast <8 x i8> %vshrn_high to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @foov8i16(<4 x i32> %a0, <4 x i32> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov8i16:
+  %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
+  %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
+  %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+  %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: shrn.4h v0, v0, #5
+; CHECK-NEXT: shrn2.8h v0, v1, #5
+; CHECK-NEXT: ret
+  %1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
+  %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @foov4i32(<2 x i64> %a0, <2 x i64> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov4i32:
+  %vshrn_low_shift = lshr <2 x i64> %a0, <i64 5, i64 5>
+  %vshrn_low = trunc <2 x i64> %vshrn_low_shift to <2 x i32>
+  %vshrn_high_shift = lshr <2 x i64> %b0, <i64 5, i64 5>
+  %vshrn_high = trunc <2 x i64> %vshrn_high_shift to <2 x i32>
+; CHECK: shrn.2s v0, v0, #5
+; CHECK-NEXT: shrn2.4s v0, v1, #5
+; CHECK-NEXT: ret
+  %1 = bitcast <2 x i32> %vshrn_low to <1 x i64>
+  %2 = bitcast <2 x i32> %vshrn_high to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: bar:
+  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vaddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+; CHECK: addhn.4h	v0, v0, v1
+; CHECK-NEXT: addhn2.8h	v0, v2, v3
+; CHECK-NEXT: ret
+  %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+  %2 = bitcast <4 x i16> %vaddhn2.i10 to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: baz:
+  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vshrn_high_shift = ashr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+  %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: addhn.4h	v0, v0, v1
+; CHECK-NEXT: shrn2.8h	v0, v2, #5
+; CHECK-NEXT: ret
+  %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+  %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @raddhn(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: raddhn:
+entry:
+; CHECK: 	raddhn.4h	v0, v0, v1
+; CHECK-NEXT: 	raddhn2.8h	v0, v2, v3
+; CHECK-NEXT: 	ret
+  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vraddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+  %0 = bitcast <4 x i16> %vraddhn2.i to <1 x i64>
+  %1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @vrshrn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrshrn:
+; CHECK: rshrn.8b	v0, v0, #5
+; CHECK-NEXT: rshrn2.16b	v0, v2, #6
+; CHECK-NEXT: ret
+  %vrshrn_n1 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
+  %vrshrn_n4 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
+  %1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64>
+  %2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrsubhn:
+; CHECK: rsubhn.8b	v0, v0, v1
+; CHECK: rsubhn2.16b	v0, v2, v3
+; CHECK-NEXT: 	ret
+  %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
+  %vrsubhn2.i10 = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
+  %1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64>
+  %2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64>