[llvm] r239514 - [AArch64] Match interleaved memory accesses into ldN/stN instructions.

Sanjay Patel spatel at rotateright.com
Thu Jun 11 07:51:49 PDT 2015


I think the test file needs to specify a triple rather than just an arch.
The test is failing for me on Darwin.

Does this mean there are no Darwin llvm buildbots currently online??

$ ./llvm-lit
../../llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll -v
llvm-lit: lit.cfg:279: note: Did not find llvm-go in
/Users/spatel/myllvm/build/./bin
-- Testing: 1 tests, 1 threads --
FAIL: LLVM :: CodeGen/AArch64/aarch64-interleaved-accesses.ll (1 of 1)
******************** TEST 'LLVM ::
CodeGen/AArch64/aarch64-interleaved-accesses.ll' FAILED ********************
Script:
--
llc -march=aarch64 -aarch64-interleaved-access-opt=true <
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
| FileCheck
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
--
Exit Code: 1

Command Output (stderr):
--
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:4:10:
error: expected string not found in input
; CHECK: ld2 { v0.8b, v1.8b }, [x0]
         ^
<stdin>:5:16: note: scanning from here
_load_factor2: ; @load_factor2
               ^
<stdin>:8:2: note: possible intended match here
 ld2.8b { v0, v1 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:14:10:
error: expected string not found in input
; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
         ^
<stdin>:15:15: note: scanning from here
_load_delat3: ; @load_delat3
              ^
<stdin>:18:2: note: possible intended match here
 ld3.4s { v0, v1, v2 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:25:10:
error: expected string not found in input
; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
         ^
<stdin>:25:16: note: scanning from here
_load_factor4: ; @load_factor4
               ^
<stdin>:28:2: note: possible intended match here
 ld4.4s { v0, v1, v2, v3 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:36:10:
error: expected string not found in input
; CHECK: st2 { v0.8b, v1.8b }, [x0]
         ^
<stdin>:35:17: note: scanning from here
_store_factor2: ; @store_factor2
                ^
<stdin>:38:2: note: possible intended match here
 st2.8b { v0, v1 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:44:10:
error: expected string not found in input
; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
         ^
<stdin>:44:17: note: scanning from here
_store_factor3: ; @store_factor3
                ^
<stdin>:47:7: note: possible intended match here
 st3.4s { v0, v1, v2 }, [x0]
      ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:55:10:
error: expected string not found in input
; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
         ^
<stdin>:53:17: note: scanning from here
_store_factor4: ; @store_factor4
                ^
<stdin>:56:7: note: possible intended match here
 st4.4s { v0, v1, v2, v3 }, [x0]
      ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:69:10:
error: expected string not found in input
; CHECK: ld2 { v0.2d, v1.2d }, [x0]
         ^
<stdin>:62:23: note: scanning from here
_load_ptrvec_factor2: ; @load_ptrvec_factor2
                      ^
<stdin>:65:6: note: possible intended match here
 ld2.2d { v0, v1 }, [x0]
     ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:78:10:
error: expected string not found in input
; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
         ^
<stdin>:71:23: note: scanning from here
_load_ptrvec_factor3: ; @load_ptrvec_factor3
                      ^
<stdin>:74:6: note: possible intended match here
 ld3.2d { v0, v1, v2 }, [x0]
     ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:90:10:
error: expected string not found in input
; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
         ^
<stdin>:82:23: note: scanning from here
_load_ptrvec_factor4: ; @load_ptrvec_factor4
                      ^
<stdin>:85:6: note: possible intended match here
 ld4.2d { v0, v1, v2, v3 }, [x0]
     ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:102:10:
error: expected string not found in input
; CHECK: st2 { v0.2d, v1.2d }, [x0]
         ^
<stdin>:93:24: note: scanning from here
_store_ptrvec_factor2: ; @store_ptrvec_factor2
                       ^
<stdin>:96:2: note: possible intended match here
 st2.2d { v0, v1 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:111:10:
error: expected string not found in input
; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
         ^
<stdin>:102:24: note: scanning from here
_store_ptrvec_factor3: ; @store_ptrvec_factor3
                       ^
<stdin>:105:2: note: possible intended match here
 st3.2d { v0, v1, v2 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:122:10:
error: expected string not found in input
; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
         ^
<stdin>:111:24: note: scanning from here
_store_ptrvec_factor4: ; @store_ptrvec_factor4
                       ^
<stdin>:114:2: note: possible intended match here
 st4.2d { v0, v1, v2, v3 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:136:10:
error: expected string not found in input
; CHECK: ld2 { v0.4s, v1.4s }, [x0]
         ^
<stdin>:120:27: note: scanning from here
_load_undef_mask_factor2: ; @load_undef_mask_factor2
                          ^
<stdin>:123:2: note: possible intended match here
 ld2.4s { v0, v1 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:147:10:
error: expected string not found in input
; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
         ^
<stdin>:130:27: note: scanning from here
_load_undef_mask_factor3: ; @load_undef_mask_factor3
                          ^
<stdin>:133:2: note: possible intended match here
 ld3.4s { v0, v1, v2 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:158:10:
error: expected string not found in input
; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
         ^
<stdin>:140:27: note: scanning from here
_load_undef_mask_factor4: ; @load_undef_mask_factor4
                          ^
<stdin>:143:2: note: possible intended match here
 ld4.4s { v0, v1, v2, v3 }, [x0]
 ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:169:10:
error: expected string not found in input
; CHECK: st2 { v0.4s, v1.4s }, [x0]
         ^
<stdin>:150:28: note: scanning from here
_store_undef_mask_factor2: ; @store_undef_mask_factor2
                           ^
<stdin>:153:7: note: possible intended match here
 st2.4s { v0, v1 }, [x0]
      ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:178:10:
error: expected string not found in input
; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
         ^
<stdin>:159:28: note: scanning from here
_store_undef_mask_factor3: ; @store_undef_mask_factor3
                           ^
<stdin>:162:7: note: possible intended match here
 st3.4s { v0, v1, v2 }, [x0]
      ^
/Users/spatel/myllvm/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll:189:10:
error: expected string not found in input
; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
         ^
<stdin>:168:28: note: scanning from here
_store_undef_mask_factor4: ; @store_undef_mask_factor4
                           ^
<stdin>:171:7: note: possible intended match here
 st4.4s { v0, v1, v2, v3 }, [x0]
      ^

--

********************
Testing Time: 0.12s
********************
Failing Tests (1):
    LLVM :: CodeGen/AArch64/aarch64-interleaved-accesses.ll

On Thu, Jun 11, 2015 at 3:05 AM, Hao Liu <Hao.Liu at arm.com> wrote:

> Author: haoliu
> Date: Thu Jun 11 04:05:02 2015
> New Revision: 239514
>
> URL: http://llvm.org/viewvc/llvm-project?rev=239514&view=rev
> Log:
> [AArch64] Match interleaved memory accesses into ldN/stN instructions.
>
> Add a pass AArch64InterleavedAccess to identify and match interleaved
> memory accesses. This pass transforms an interleaved load/store into
> ldN/stN intrinsic. As Loop Vectorizor disables optimization on interleaved
> accesses by default, this optimization is also disabled by default. To
> enable it by "-aarch64-interleaved-access-opt=true"
>
> E.g. Transform an interleaved load (Factor = 2):
>        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
>        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even
> elements
>        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
>      Into:
>        %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
>        %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
>        %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
>
> E.g. Transform an interleaved store (Factor = 2):
>        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ; Interleaved
> vec
>        store <8 x i32> %i.vec, <8 x i32>* %ptr
>      Into:
>        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
>        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
>        call void aarch64.neon.st2(%v0, %v1, %ptr)
>
>
> Added:
>     llvm/trunk/lib/Target/AArch64/AArch64InterleavedAccess.cpp
>     llvm/trunk/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
> Modified:
>     llvm/trunk/lib/Target/AArch64/AArch64.h
>     llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
>     llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
>     llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
>     llvm/trunk/lib/Target/AArch64/CMakeLists.txt
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>
> Modified: llvm/trunk/lib/Target/AArch64/AArch64.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64.h?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/AArch64.h (original)
> +++ llvm/trunk/lib/Target/AArch64/AArch64.h Thu Jun 11 04:05:02 2015
> @@ -38,6 +38,7 @@ FunctionPass *createAArch64LoadStoreOpti
>  ModulePass *createAArch64PromoteConstantPass();
>  FunctionPass *createAArch64ConditionOptimizerPass();
>  FunctionPass *createAArch64AddressTypePromotionPass();
> +FunctionPass *createAArch64InterleavedAccessPass();
>  FunctionPass *createAArch64A57FPLoadBalancing();
>  FunctionPass *createAArch64A53Fix835769();
>
>
> Added: llvm/trunk/lib/Target/AArch64/AArch64InterleavedAccess.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InterleavedAccess.cpp?rev=239514&view=auto
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/AArch64InterleavedAccess.cpp (added)
> +++ llvm/trunk/lib/Target/AArch64/AArch64InterleavedAccess.cpp Thu Jun 11
> 04:05:02 2015
> @@ -0,0 +1,391 @@
> +//=--------------------- AArch64InterleavedAccess.cpp
> ----------------------==//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
>
> +//===----------------------------------------------------------------------===//
> +//
> +// This file implements the AArch64InterleavedAccess pass, which
> identifies
> +// interleaved memory accesses and Transforms them into an AArch64 ldN/stN
> +// intrinsics (N = 2, 3, 4).
> +//
> +// An interleaved load reads data from memory into several vectors, with
> +// DE-interleaving the data on factor. An interleaved store writes several
> +// vectors to memory with RE-interleaving the data on factor. The
> interleave
> +// factor is equal to the number of vectors. AArch64 backend supports
> interleave
> +// factor of 2, 3 and 4.
> +//
> +// E.g. Transform an interleaved load (Factor = 2):
> +//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
> +//        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even
> elements
> +//        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd
> elements
> +//      Into:
> +//        %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
> +//        %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
> +//        %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
> +//
> +// E.g. Transform an interleaved store (Factor = 2):
> +//        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ;
> Interleaved vec
> +//        store <8 x i32> %i.vec, <8 x i32>* %ptr
> +//      Into:
> +//        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
> +//        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
> +//        call void aarch64.neon.st2(%v0, %v1, %ptr)
> +//
>
> +//===----------------------------------------------------------------------===//
> +
> +#include "AArch64.h"
> +#include "llvm/ADT/SetVector.h"
> +#include "llvm/Analysis/TargetTransformInfo.h"
> +#include "llvm/IR/InstIterator.h"
> +#include "llvm/IR/IRBuilder.h"
> +#include "llvm/IR/Module.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/MathExtras.h"
> +
> +using namespace llvm;
> +
> +#define DEBUG_TYPE "aarch64-interleaved-access"
> +
> +static const unsigned MIN_FACTOR = 2;
> +static const unsigned MAX_FACTOR = 4;
> +
> +namespace llvm {
> +static void initializeAArch64InterleavedAccessPass(PassRegistry &);
> +}
> +
> +namespace {
> +
> +class AArch64InterleavedAccess : public FunctionPass {
> +
> +public:
> +  static char ID;
> +  AArch64InterleavedAccess() : FunctionPass(ID) {
> +
> initializeAArch64InterleavedAccessPass(*PassRegistry::getPassRegistry());
> +  }
> +
> +  const char *getPassName() const override {
> +    return "AArch64 Interleaved Access Pass";
> +  }
> +
> +  bool runOnFunction(Function &F) override;
> +
> +private:
> +  const DataLayout *DL;
> +  Module *M;
> +
> +  /// \brief Transform an interleaved load into ldN intrinsic.
> +  bool matchInterleavedLoad(ShuffleVectorInst *SVI,
> +                            SmallSetVector<Instruction *, 32> &DeadInsts);
> +
> +  /// \brief Transform an interleaved store into stN intrinsic.
> +  bool matchInterleavedStore(ShuffleVectorInst *SVI,
> +                             SmallSetVector<Instruction *, 32>
> &DeadInsts);
> +};
> +} // end anonymous namespace.
> +
> +char AArch64InterleavedAccess::ID = 0;
> +
> +INITIALIZE_PASS_BEGIN(AArch64InterleavedAccess, DEBUG_TYPE,
> +                      "AArch64 interleaved access Pass", false, false)
> +INITIALIZE_PASS_END(AArch64InterleavedAccess, DEBUG_TYPE,
> +                    "AArch64 interleaved access Pass", false, false)
> +
> +FunctionPass *llvm::createAArch64InterleavedAccessPass() {
> +  return new AArch64InterleavedAccess();
> +}
> +
> +/// \brief Get a ldN/stN intrinsic according to the Factor (2, 3, or 4).
> +static Intrinsic::ID getLdNStNIntrinsic(unsigned Factor, bool IsLoad) {
> +  static const Intrinsic::ID LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
> +                                           Intrinsic::aarch64_neon_ld3,
> +                                           Intrinsic::aarch64_neon_ld4};
> +  static const Intrinsic::ID StoreInt[3] = {Intrinsic::aarch64_neon_st2,
> +                                            Intrinsic::aarch64_neon_st3,
> +                                            Intrinsic::aarch64_neon_st4};
> +
> +  assert(Factor >= MIN_FACTOR && Factor <= MAX_FACTOR &&
> +         "Invalid interleave factor");
> +
> +  if (IsLoad)
> +    return LoadInt[Factor - 2];
> +  else
> +    return StoreInt[Factor - 2];
> +}
> +
> +/// \brief Check if the mask is a DE-interleave mask of the given factor
> +/// \p Factor like:
> +///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
> +static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned
> Factor,
> +                                       unsigned &Index) {
> +  // Check all potential start indices from 0 to (Factor - 1).
> +  for (Index = 0; Index < Factor; Index++) {
> +    unsigned i = 0;
> +
> +    // Check that elements are in ascending order by Factor.
> +    for (; i < Mask.size(); i++)
> +      if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i *
> Factor)
> +        break;
> +
> +    if (i == Mask.size())
> +      return true;
> +  }
> +
> +  return false;
> +}
> +
> +/// \brief Check if the mask is a DE-interleave mask for an interleaved
> load.
> +///
> +/// E.g. DE-interleave masks (Factor = 2) could be:
> +///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
> +///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
> +static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
> +                               unsigned &Index) {
> +  unsigned NumElts = Mask.size();
> +  if (NumElts < 2)
> +    return false;
> +
> +  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
> +    if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
> +      return true;
> +
> +  return false;
> +}
> +
> +/// \brief Check if the given mask \p Mask is RE-interleaved mask of the
> given
> +/// factor \p Factor.
> +///
> +/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts +
> 1, ...>
> +static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned
> Factor) {
> +  unsigned NumElts = Mask.size();
> +  if (NumElts % Factor)
> +    return false;
> +
> +  unsigned NumSubElts = NumElts / Factor;
> +  if (!isPowerOf2_32(NumSubElts))
> +    return false;
> +
> +  for (unsigned i = 0; i < NumSubElts; i++)
> +    for (unsigned j = 0; j < Factor; j++)
> +      if (Mask[i * Factor + j] >= 0 &&
> +          static_cast<unsigned>(Mask[i * Factor + j]) != j * NumSubElts +
> i)
> +        return false;
> +
> +  return true;
> +}
> +
> +/// \brief Check if the mask is RE-interleave mask for an interleaved
> store.
> +///
> +/// E.g. The RE-interleave mask (Factor = 2) could be:
> +///     <0, 4, 1, 5, 2, 6, 3, 7>
> +static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
> +  if (Mask.size() < 4)
> +    return false;
> +
> +  // Check potential Factors and return true if find a factor for the
> mask.
> +  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
> +    if (isReInterleaveMaskOfFactor(Mask, Factor))
> +      return true;
> +
> +  return false;
> +}
> +
> +/// \brief Get a mask consisting of sequential integers starting from \p
> Start.
> +///
> +/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
> +                                   unsigned NumElts) {
> +  SmallVector<Constant *, 16> Mask;
> +  for (unsigned i = 0; i < NumElts; i++)
> +    Mask.push_back(Builder.getInt32(Start + i));
> +
> +  return ConstantVector::get(Mask);
> +}
> +
> +bool AArch64InterleavedAccess::matchInterleavedLoad(
> +    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts)
> {
> +  if (DeadInsts.count(SVI))
> +    return false;
> +
> +  LoadInst *LI = dyn_cast<LoadInst>(SVI->getOperand(0));
> +  if (!LI || !LI->isSimple() || !isa<UndefValue>(SVI->getOperand(1)))
> +    return false;
> +
> +  SmallVector<ShuffleVectorInst *, 4> Shuffles;
> +
> +  // Check if all users of this load are shufflevectors.
> +  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
> +    ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(*UI);
> +    if (!SV)
> +      return false;
> +
> +    Shuffles.push_back(SV);
> +  }
> +
> +  // Check if the type of the first shuffle is legal.
> +  VectorType *VecTy = Shuffles[0]->getType();
> +  unsigned TypeSize = DL->getTypeAllocSizeInBits(VecTy);
> +  if (TypeSize != 64 && TypeSize != 128)
> +    return false;
> +
> +  // Check if the mask of the first shuffle is strided and get the start
> index.
> +  unsigned Factor, Index;
> +  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
> +    return false;
> +
> +  // Holds the corresponding index for each strided shuffle.
> +  SmallVector<unsigned, 4> Indices;
> +  Indices.push_back(Index);
> +
> +  // Check if other shufflevectors are of the same type and factor
> +  for (unsigned i = 1; i < Shuffles.size(); i++) {
> +    if (Shuffles[i]->getType() != VecTy)
> +      return false;
> +
> +    unsigned Index;
> +    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
> +                                    Index))
> +      return false;
> +
> +    Indices.push_back(Index);
> +  }
> +
> +  DEBUG(dbgs() << "Found an interleaved load:" << *LI << "\n");
> +
> +  // A pointer vector can not be the return type of the ldN intrinsics.
> Need to
> +  // load integer vectors first and then convert to pointer vectors.
> +  Type *EltTy = VecTy->getVectorElementType();
> +  if (EltTy->isPointerTy())
> +    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
> +                            VecTy->getVectorNumElements());
> +
> +  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
> +  Type *Tys[2] = {VecTy, PtrTy};
> +  Function *LdNFunc =
> +      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, true), Tys);
> +
> +  IRBuilder<> Builder(LI);
> +  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
> +
> +  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
> +  DEBUG(dbgs() << "   Created:" << *LdN << "\n");
> +
> +  // Replace each strided shufflevector with the corresponding vector
> loaded
> +  // by ldN.
> +  for (unsigned i = 0; i < Shuffles.size(); i++) {
> +    ShuffleVectorInst *SV = Shuffles[i];
> +    unsigned Index = Indices[i];
> +
> +    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
> +
> +    // Convert the integer vector to pointer vector if the element is
> pointer.
> +    if (EltTy->isPointerTy())
> +      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
> +
> +    SV->replaceAllUsesWith(SubVec);
> +
> +    DEBUG(dbgs() << "  Replaced:" << *SV << "\n"
> +                 << "      With:" << *SubVec << "\n");
> +
> +    // Avoid analyzing it twice.
> +    DeadInsts.insert(SV);
> +  }
> +
> +  // Mark this load as dead.
> +  DeadInsts.insert(LI);
> +  return true;
> +}
> +
> +bool AArch64InterleavedAccess::matchInterleavedStore(
> +    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts)
> {
> +  if (DeadInsts.count(SVI) || !SVI->hasOneUse())
> +    return false;
> +
> +  StoreInst *SI = dyn_cast<StoreInst>(SVI->user_back());
> +  if (!SI || !SI->isSimple())
> +    return false;
> +
> +  // Check if the mask is interleaved and get the interleave factor.
> +  unsigned Factor;
> +  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
> +    return false;
> +
> +  VectorType *VecTy = SVI->getType();
> +  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
> +  Type *EltTy = VecTy->getVectorElementType();
> +  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
> +
> +  // Skip illegal vector types.
> +  unsigned TypeSize = DL->getTypeAllocSizeInBits(SubVecTy);
> +  if (TypeSize != 64 && TypeSize != 128)
> +    return false;
> +
> +  DEBUG(dbgs() << "Found an interleaved store:" << *SI << "\n");
> +
> +  Value *Op0 = SVI->getOperand(0);
> +  Value *Op1 = SVI->getOperand(1);
> +  IRBuilder<> Builder(SI);
> +
> +  // StN intrinsics don't support pointer vectors as arguments. Convert
> pointer
> +  // vectors to integer vectors.
> +  if (EltTy->isPointerTy()) {
> +    Type *IntTy = DL->getIntPtrType(EltTy);
> +    unsigned NumOpElts =
> +        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
> +
> +    // The corresponding integer vector type of the same element size.
> +    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
> +
> +    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
> +    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
> +    SubVecTy = VectorType::get(IntTy, NumSubElts);
> +  }
> +
> +  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
> +  Type *Tys[2] = {SubVecTy, PtrTy};
> +  Function *StNFunc =
> +      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, false),
> Tys);
> +
> +  SmallVector<Value *, 5> Ops;
> +
> +  // Split the shufflevector operands into sub vectors for the new stN
> call.
> +  for (unsigned i = 0; i < Factor; i++)
> +    Ops.push_back(Builder.CreateShuffleVector(
> +        Op0, Op1, getSequentialMask(Builder, NumSubElts * i,
> NumSubElts)));
> +
> +  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
> +  CallInst *StN = Builder.CreateCall(StNFunc, Ops);
> +
> +  (void)StN; // silence warning.
> +  DEBUG(dbgs() << "  Replaced:" << *SI << "'\n");
> +  DEBUG(dbgs() << "      with:" << *StN << "\n");
> +
> +  // Mark this shufflevector and store as dead.
> +  DeadInsts.insert(SI);
> +  DeadInsts.insert(SVI);
> +  return true;
> +}
> +
> +bool AArch64InterleavedAccess::runOnFunction(Function &F) {
> +  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
> +
> +  M = F.getParent();
> +  DL = &M->getDataLayout();
> +
> +  // Holds dead instructions that will be erased later.
> +  SmallSetVector<Instruction *, 32> DeadInsts;
> +  bool Changed = false;
> +  for (auto &I : inst_range(F)) {
> +    if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
> +      Changed |= matchInterleavedLoad(SVI, DeadInsts);
> +      Changed |= matchInterleavedStore(SVI, DeadInsts);
> +    }
> +  }
> +
> +  for (auto I : DeadInsts)
> +    I->eraseFromParent();
> +
> +  return Changed;
> +}
>
> Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp (original)
> +++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp Thu Jun 11
> 04:05:02 2015
> @@ -67,6 +67,11 @@ EnableAtomicTidy("aarch64-atomic-cfg-tid
>                            " to make use of cmpxchg flow-based
> information"),
>                   cl::init(true));
>
> +static cl::opt<bool> AArch64InterleavedAccessOpt(
> +    "aarch64-interleaved-access-opt",
> +    cl::desc("Optimize interleaved memory accesses in the AArch64
> backend"),
> +    cl::init(false), cl::Hidden);
> +
>  static cl::opt<bool>
>  EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
>                          cl::desc("Run early if-conversion"),
> @@ -226,6 +231,9 @@ void AArch64PassConfig::addIRPasses() {
>    if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
>      addPass(createCFGSimplificationPass());
>
> +  if (TM->getOptLevel() != CodeGenOpt::None &&
> AArch64InterleavedAccessOpt)
> +    addPass(createAArch64InterleavedAccessPass());
> +
>    TargetPassConfig::addIRPasses();
>
>    if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
>
> Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (original)
> +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp Thu Jun
> 11 04:05:02 2015
> @@ -407,6 +407,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost
>    return LT.first;
>  }
>
> +unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
> +    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned>
> Indices,
> +    unsigned Alignment, unsigned AddressSpace) {
> +  assert(isa<VectorType>(VecTy) && "Expect vector types");
> +
> +  if (Factor > 1 && Factor < 5 && isTypeLegal(VecTy))
> +    return Factor;
> +
> +  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
> +                                           Alignment, AddressSpace);
> +}
> +
>  unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *>
> Tys) {
>    unsigned Cost = 0;
>    for (auto *I : Tys) {
>
> Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h (original)
> +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h Thu Jun 11
> 04:05:02 2015
> @@ -139,6 +139,11 @@ public:
>
>    bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
>
> +  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
> +                                      unsigned Factor,
> +                                      ArrayRef<unsigned> Indices,
> +                                      unsigned Alignment,
> +                                      unsigned AddressSpace);
>    /// @}
>  };
>
>
> Modified: llvm/trunk/lib/Target/AArch64/CMakeLists.txt
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/CMakeLists.txt?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AArch64/CMakeLists.txt (original)
> +++ llvm/trunk/lib/Target/AArch64/CMakeLists.txt Thu Jun 11 04:05:02 2015
> @@ -38,6 +38,7 @@ add_llvm_target(AArch64CodeGen
>    AArch64PBQPRegAlloc.cpp
>    AArch64RegisterInfo.cpp
>    AArch64SelectionDAGInfo.cpp
> +  AArch64InterleavedAccess.cpp
>    AArch64StorePairSuppress.cpp
>    AArch64Subtarget.cpp
>    AArch64TargetMachine.cpp
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=239514&r1=239513&r2=239514&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Jun 11
> 04:05:02 2015
> @@ -139,7 +139,7 @@ static cl::opt<bool> EnableMemAccessVers
>      cl::desc("Enable symblic stride memory access versioning"));
>
>  static cl::opt<bool> EnableInterleavedMemAccesses(
> -    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
> +    "enable-interleaved-mem-accesses", cl::init(true), cl::Hidden,
>      cl::desc("Enable vectorization on interleaved memory accesses in a
> loop"));
>
>  /// Maximum factor for an interleaved memory access.
>
> Added: llvm/trunk/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll?rev=239514&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll (added)
> +++ llvm/trunk/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll Thu
> Jun 11 04:05:02 2015
> @@ -0,0 +1,197 @@
> +; RUN: llc -march=aarch64 -aarch64-interleaved-access-opt=true < %s |
> FileCheck %s
> +
> +; CHECK-LABEL: load_factor2:
> +; CHECK: ld2 { v0.8b, v1.8b }, [x0]
> +define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
> +  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
> +  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x
> i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
> +  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x
> i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
> +  %add = add nsw <8 x i8> %strided.v0, %strided.v1
> +  ret <8 x i8> %add
> +}
> +
> +; CHECK-LABEL: load_delat3:
> +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
> +define <4 x i32> @load_delat3(i32* %ptr) {
> +  %base = bitcast i32* %ptr to <12 x i32>*
> +  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
> +  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4
> x i32> <i32 2, i32 5, i32 8, i32 11>
> +  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4
> x i32> <i32 1, i32 4, i32 7, i32 10>
> +  %add = add nsw <4 x i32> %strided.v2, %strided.v1
> +  ret <4 x i32> %add
> +}
> +
> +; CHECK-LABEL: load_factor4:
> +; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
> +define <4 x i32> @load_factor4(i32* %ptr) {
> +  %base = bitcast i32* %ptr to <16 x i32>*
> +  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
> +  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4
> x i32> <i32 0, i32 4, i32 8, i32 12>
> +  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4
> x i32> <i32 2, i32 6, i32 10, i32 14>
> +  %add = add nsw <4 x i32> %strided.v0, %strided.v2
> +  ret <4 x i32> %add
> +}
> +
> +; CHECK-LABEL: store_factor2:
> +; CHECK: st2 { v0.8b, v1.8b }, [x0]
> +define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
> +  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32>
> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12,
> i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
> +  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_factor3:
> +; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
> +define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
> i32> %v2) {
> +  %base = bitcast i32* %ptr to <12 x i32>*
> +  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12
> x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32
> 3, i32 7, i32 11>
> +  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_factor4:
> +; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
> +define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x
> i32> %v2, <4 x i32> %v3) {
> +  %base = bitcast i32* %ptr to <16 x i32>*
> +  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3,
> <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32
> 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
> +  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
> +  ret void
> +}
> +
> +; The following cases test that interleaved access of pointer vectors can
> be
> +; matched to ldN/stN instructions.
> +
> +; CHECK-LABEL: load_ptrvec_factor2:
> +; CHECK: ld2 { v0.2d, v1.2d }, [x0]
> +define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
> +  %base = bitcast i32** %ptr to <4 x i32*>*
> +  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
> +  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2
> x i32> <i32 0, i32 2>
> +  ret <2 x i32*> %strided.v0
> +}
> +
> +; CHECK-LABEL: load_ptrvec_factor3:
> +; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
> +define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x
> i32*>* %ptr2) {
> +  %base = bitcast i32** %ptr to <6 x i32*>*
> +  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
> +  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2
> x i32> <i32 2, i32 5>
> +  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
> +  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2
> x i32> <i32 1, i32 4>
> +  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
> +  ret void
> +}
> +
> +; CHECK-LABEL: load_ptrvec_factor4:
> +; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
> +define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x
> i32*>* %ptr2) {
> +  %base = bitcast i32** %ptr to <8 x i32*>*
> +  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
> +  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2
> x i32> <i32 1, i32 5>
> +  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2
> x i32> <i32 3, i32 7>
> +  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
> +  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_ptrvec_factor2:
> +; CHECK: st2 { v0.2d, v1.2d }, [x0]
> +define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*>
> %v1) {
> +  %base = bitcast i32** %ptr to <4 x i32*>*
> +  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x
> i32> <i32 0, i32 2, i32 1, i32 3>
> +  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_ptrvec_factor3:
> +; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
> +define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*>
> %v1, <2 x i32*> %v2) {
> +  %base = bitcast i32** %ptr to <6 x i32*>*
> +  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32
> 0, i32 1, i32 2, i32 3>
> +  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32
> 0, i32 1, i32 undef, i32 undef>
> +  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u,
> <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
> +  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_ptrvec_factor4:
> +; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
> +define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*>
> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
> +  %base = bitcast i32* %ptr to <8 x i32*>*
> +  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32
> 0, i32 1, i32 2, i32 3>
> +  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32
> 0, i32 1, i32 2, i32 3>
> +  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3,
> <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
> +  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
> +  ret void
> +}
> +
> +; The following cases check that shuffle masks with undef indices can be
> matched
> +; into ldN/stN instructions.
> +
> +; CHECK-LABEL: load_undef_mask_factor2:
> +; CHECK: ld2 { v0.4s, v1.4s }, [x0]
> +define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
> +  %base = bitcast i32* %ptr to <8 x i32>*
> +  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
> +  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x
> i32> <i32 undef, i32 2, i32 undef, i32 6>
> +  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x
> i32> <i32 undef, i32 3, i32 undef, i32 7>
> +  %add = add nsw <4 x i32> %strided.v0, %strided.v1
> +  ret <4 x i32> %add
> +}
> +
> +; CHECK-LABEL: load_undef_mask_factor3:
> +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
> +define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
> +  %base = bitcast i32* %ptr to <12 x i32>*
> +  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
> +  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4
> x i32> <i32 2, i32 undef, i32 undef, i32 undef>
> +  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4
> x i32> <i32 1, i32 4, i32 7, i32 10>
> +  %add = add nsw <4 x i32> %strided.v2, %strided.v1
> +  ret <4 x i32> %add
> +}
> +
> +; CHECK-LABEL: load_undef_mask_factor4:
> +; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
> +define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
> +  %base = bitcast i32* %ptr to <16 x i32>*
> +  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
> +  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4
> x i32> <i32 0, i32 4, i32 undef, i32 undef>
> +  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4
> x i32> <i32 2, i32 6, i32 undef, i32 undef>
> +  %add = add nsw <4 x i32> %strided.v0, %strided.v2
> +  ret <4 x i32> %add
> +}
> +
> +; CHECK-LABEL: store_undef_mask_factor2:
> +; CHECK: st2 { v0.4s, v1.4s }, [x0]
> +define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32>
> %v1) {
> +  %base = bitcast i32* %ptr to <8 x i32>*
> +  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x
> i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32
> 7>
> +  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_undef_mask_factor3:
> +; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
> +define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32>
> %v1, <4 x i32> %v2) {
> +  %base = bitcast i32* %ptr to <12 x i32>*
> +  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
> +  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12
> x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32
> 10, i32 3, i32 7, i32 11>
> +  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: store_undef_mask_factor4:
> +; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
> +define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32>
> %v1, <4 x i32> %v2, <4 x i32> %v3) {
> +  %base = bitcast i32* %ptr to <16 x i32>*
> +  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0,
> i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
> +  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3,
> <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32
> 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
> +  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
> +  ret void
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150611/c084f674/attachment.html>


More information about the llvm-commits mailing list