[llvm] a4b6b1e - [InterleaveAccess] Recognise Interleave loads through binary operations
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 29 02:13:40 PDT 2020
Author: David Green
Date: 2020-10-29T09:13:23Z
New Revision: a4b6b1e1c83fdfc5954e0fb631c2e6237236589e
URL: https://github.com/llvm/llvm-project/commit/a4b6b1e1c83fdfc5954e0fb631c2e6237236589e
DIFF: https://github.com/llvm/llvm-project/commit/a4b6b1e1c83fdfc5954e0fb631c2e6237236589e.diff
LOG: [InterleaveAccess] Recognise Interleave loads through binary operations
InstCombine will currently sink identical shuffles through vector binary
operations. This is probably useful in general, but it can break up the
code pattern we use to represent an interleaving load group. This patch
teaches the InterleavedAccessPass to re-recognise the pattern of
shuffles sunk past binary operations and to fold them back if an
interleave group can be created.
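As an illustration, here is a minimal sketch of the two forms, modelled
on the vld2 test in binopshuffles.ll below (value names are
illustrative). The vectorizer emits de-interleaving shuffles that feed
the binary operations directly:

define <4 x float> @before_instcombine(<8 x float>* %pSrc) {
entry:
  %wide.vec = load <8 x float>, <8 x float>* %pSrc, align 4
  ; De-interleave the even and odd lanes straight from the load.
  %even = shufflevector <8 x float> %wide.vec, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <8 x float> %wide.vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sqe = fmul fast <4 x float> %even, %even
  %sqo = fmul fast <4 x float> %odd, %odd
  %sum = fadd fast <4 x float> %sqo, %sqe
  ret <4 x float> %sum
}

Because both operands of each fmul are the same shuffle, InstCombine
sinks the shuffle past the fmul, leaving the shuffles on the widened
binop results instead of on the load:

define <4 x float> @after_instcombine(<8 x float>* %pSrc) {
entry:
  %wide.vec = load <8 x float>, <8 x float>* %pSrc, align 4
  ; Each fmul now operates on the full 8-wide vector and has a single
  ; shuffle user, which is the shape this patch looks for.
  %sq1 = fmul fast <8 x float> %wide.vec, %wide.vec
  %even = shufflevector <8 x float> %sq1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %sq2 = fmul fast <8 x float> %wide.vec, %wide.vec
  %odd = shufflevector <8 x float> %sq2, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = fadd fast <4 x float> %odd, %even
  ret <4 x float> %sum
}

The pass rewrites each shuffle(binop(x, y)) in the second form back
into binop(shuffle(x), shuffle(y)), recovering the first form so that
an interleaved load intrinsic can be generated.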
Differential Revision: https://reviews.llvm.org/D89489
Added:
Modified:
llvm/lib/CodeGen/InterleavedAccessPass.cpp
llvm/test/CodeGen/AArch64/vldn_shuffle.ll
llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
Removed:
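To show the end result in condensed form (paraphrasing the new CHECK
lines in binopshuffles.ll below; value names are illustrative), the
vld2 case now lowers to a single structured load feeding narrow binops:

  %ptr = bitcast <8 x float>* %pSrc to <4 x float>*
  %ldN = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* %ptr)
  %even = extractvalue { <4 x float>, <4 x float> } %ldN, 0
  %odd = extractvalue { <4 x float>, <4 x float> } %ldN, 1
  %sqe = fmul <4 x float> %even, %even
  %sqo = fmul <4 x float> %odd, %odd
  %sum = fadd fast <4 x float> %sqo, %sqe

(The pass's actual output contains one extractvalue per rewritten
shuffle operand, so some extracts appear twice; the duplicates are
trivially redundant.)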
################################################################################
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c4d83547a06c..73771609a792 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -66,6 +66,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -118,6 +119,14 @@ class InterleavedAccess : public FunctionPass {
/// replacements are also performed.
bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
ArrayRef<ShuffleVectorInst *> Shuffles);
+
+ /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
+ /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
+ /// interleaving load. Any newly created shuffles that operate on \p LI will
+ /// be added to \p Shuffles.
+ bool tryReplaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
+ LoadInst *LI);
};
} // end anonymous namespace.
@@ -283,61 +292,85 @@ bool InterleavedAccess::lowerInterleavedLoad(
if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
return false;
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions or binary operators, we save
+ // them to later check if they can be modified to extract from one of the
+ // shufflevectors instead of the load.
+
SmallVector<ShuffleVectorInst *, 4> Shuffles;
SmallVector<ExtractElementInst *, 4> Extracts;
+ // BinOpShuffles need to be handled a single time in case both operands of the
+ // binop are the same load.
+ SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
- // Check if all users of this load are shufflevectors. If we encounter any
- // users that are extractelement instructions, we save them to later check if
- // they can be modifed to extract from one of the shufflevectors instead of
- // the load.
- for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
- auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+ for (auto *User : LI->users()) {
+ auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
continue;
}
- ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
+ auto *BI = dyn_cast<BinaryOperator>(User);
+ if (BI && BI->hasOneUse()) {
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) {
+ BinOpShuffles.insert(SVI);
+ continue;
+ }
+ }
+ auto *SVI = dyn_cast<ShuffleVectorInst>(User);
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
return false;
Shuffles.push_back(SVI);
}
- if (Shuffles.empty())
+ if (Shuffles.empty() && BinOpShuffles.empty())
return false;
unsigned Factor, Index;
unsigned NumLoadElements =
cast<FixedVectorType>(LI->getType())->getNumElements();
+ auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
- if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
- MaxFactor, NumLoadElements))
+ if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
+ NumLoadElements))
return false;
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
- Indices.push_back(Index);
- Type *VecTy = Shuffles[0]->getType();
+ Type *VecTy = FirstSVI->getType();
// Check if other shufflevectors are also DE-interleaved of the same type
// and factor as the first shufflevector.
- for (unsigned i = 1; i < Shuffles.size(); i++) {
- if (Shuffles[i]->getType() != VecTy)
+ for (auto *Shuffle : Shuffles) {
+ if (Shuffle->getType() != VecTy)
return false;
-
- if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
Index))
return false;
Indices.push_back(Index);
}
+ for (auto *Shuffle : BinOpShuffles) {
+ if (Shuffle->getType() != VecTy)
+ return false;
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
+ Index))
+ return false;
+
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ Indices.push_back(Index);
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ Indices.push_back(Index);
+ }
// Try and modify users of the load that are extractelement instructions to
// use the shufflevector instructions instead of the load.
if (!tryReplaceExtracts(Extracts, Shuffles))
return false;
+ if (!tryReplaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI))
+ return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
@@ -352,6 +385,34 @@ bool InterleavedAccess::lowerInterleavedLoad(
return true;
}
+bool InterleavedAccess::tryReplaceBinOpShuffles(
+ ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ for (auto *SVI : BinOpShuffles) {
+ BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
+ ArrayRef<int> Mask = SVI->getShuffleMask();
+
+ auto *NewSVI1 = new ShuffleVectorInst(
+ BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask,
+ SVI->getName(), SVI);
+ auto *NewSVI2 = new ShuffleVectorInst(
+ BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask,
+ SVI->getName(), SVI);
+ Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2,
+ BI->getName(), SVI);
+ SVI->replaceAllUsesWith(NewBI);
+ LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI
+ << "\n With : " << *NewSVI1 << "\n And : "
+ << *NewSVI2 << "\n And : " << *NewBI << "\n");
+ RecursivelyDeleteTriviallyDeadInstructions(SVI);
+ if (NewSVI1->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI1);
+ if (NewSVI2->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI2);
+ }
+ return true;
+}
+
bool InterleavedAccess::tryReplaceExtracts(
ArrayRef<ExtractElementInst *> Extracts,
ArrayRef<ShuffleVectorInst *> Shuffles) {
@@ -421,7 +482,7 @@ bool InterleavedAccess::lowerInterleavedStore(
if (!SI->isSimple())
return false;
- ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
@@ -461,10 +522,10 @@ bool InterleavedAccess::runOnFunction(Function &F) {
bool Changed = false;
for (auto &I : instructions(F)) {
- if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+ if (auto *LI = dyn_cast<LoadInst>(&I))
Changed |= lowerInterleavedLoad(LI, DeadInsts);
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ if (auto *SI = dyn_cast<StoreInst>(&I))
Changed |= lowerInterleavedStore(SI, DeadInsts);
}
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 4c501e5403f9..99100a2ab4c8 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -7,13 +7,10 @@ define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDs
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q0, [x0], #32
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: uzp1 v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s
+; CHECK-NEXT: str q2, [x1, x8]
; CHECK-NEXT: add x8, x8, #16 // =16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB0_1
@@ -50,27 +47,11 @@ define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDs
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q2, q0, [x0, #16]
-; CHECK-NEXT: ldr q1, [x0], #48
-; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: ext v3.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: ext v5.16b, v1.16b, v3.16b, #12
-; CHECK-NEXT: ext v3.16b, v3.16b, v2.16b, #4
-; CHECK-NEXT: dup v4.4s, v0.s[1]
-; CHECK-NEXT: mov v2.s[0], v1.s[2]
-; CHECK-NEXT: dup v1.4s, v0.s[2]
-; CHECK-NEXT: mov v0.s[2], v0.s[0]
-; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #12
-; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8
-; CHECK-NEXT: mov v5.s[3], v4.s[3]
-; CHECK-NEXT: mov v3.s[3], v1.s[3]
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v1.4s, v3.4s, v5.4s
-; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: str q0, [x1, x8]
+; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s
+; CHECK-NEXT: str q3, [x1, x8]
; CHECK-NEXT: add x8, x8, #16 // =16
; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT: b.ne .LBB1_1
@@ -110,37 +91,15 @@ define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDs
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT: add x9, x1, x8
; CHECK-NEXT: add x8, x8, #32 // =32
-; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: zip1 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT: trn2 v7.4s, v2.4s, v3.4s
-; CHECK-NEXT: zip1 v4.4s, v0.4s, v1.4s
-; CHECK-NEXT: trn2 v6.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8
-; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v4.16b, v5.16b, v4.16b, #8
-; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT: ext v0.16b, v6.16b, v0.16b, #8
-; CHECK-NEXT: ext v6.16b, v7.16b, v6.16b, #8
-; CHECK-NEXT: mov v2.s[3], v3.s[2]
-; CHECK-NEXT: ext v0.16b, v5.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT: ext v4.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v2.4s, v4.4s, v3.4s
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fadd v3.4s, v0.4s, v1.4s
-; CHECK-NEXT: add x0, x0, #64 // =64
-; CHECK-NEXT: st2 { v2.4s, v3.4s }, [x9]
+; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
+; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
+; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %while.end
; CHECK-NEXT: ret
@@ -184,16 +143,13 @@ define void @twosrc(float* nocapture readonly %pSrc, float* nocapture readonly %
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: ldp q0, q1, [x9]
-; CHECK-NEXT: ldp q3, q2, [x10]
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT: add x8, x8, #32 // =32
; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: fmul v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: str q0, [x2], #16
+; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s
+; CHECK-NEXT: str q4, [x2], #16
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %while.end
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index 7c4fef3c71c5..c04243ee5754 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -4,94 +4,49 @@
define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: beq.w .LBB0_9
+; CHECK-NEXT: beq .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #8
-; CHECK-NEXT: blo.w .LBB0_6
+; CHECK-NEXT: blo .LBB0_9
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r3, r0, r2, lsl #2
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT: cmphi r3, r0
-; CHECK-NEXT: bhi .LBB0_6
+; CHECK-NEXT: bhi .LBB0_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
-; CHECK-NEXT: bic r5, r2, #7
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: sub.w r3, r5, #8
-; CHECK-NEXT: and r8, r2, #7
-; CHECK-NEXT: add.w r12, r1, r5, lsl #1
-; CHECK-NEXT: add.w r3, r4, r3, lsr #3
-; CHECK-NEXT: mov r7, r3
-; CHECK-NEXT: add.w r3, r0, r5, lsl #2
+; CHECK-NEXT: bic r4, r2, #7
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: sub.w r12, r4, #8
+; CHECK-NEXT: and r7, r2, #7
+; CHECK-NEXT: add.w r3, r3, r12, lsr #3
+; CHECK-NEXT: add.w r12, r1, r4, lsl #1
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: add.w r3, r0, r4, lsl #2
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q0, [r0], #32
-; CHECK-NEXT: mov lr, r7
+; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: subs.w lr, lr, #1
-; CHECK-NEXT: vmul.f16 q1, q0, q0
-; CHECK-NEXT: mov r7, lr
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vmovx.f16 s8, s6
-; CHECK-NEXT: vmov r4, s0
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov r6, s0
-; CHECK-NEXT: vmov.16 q0[0], r6
-; CHECK-NEXT: vmov r6, s4
-; CHECK-NEXT: vmov.16 q0[1], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmovx.f16 s8, s7
-; CHECK-NEXT: vmov.16 q0[2], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmov.16 q0[3], r4
-; CHECK-NEXT: vmul.f16 q2, q2, q2
-; CHECK-NEXT: vmovx.f16 s12, s8
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vmov.16 q0[4], r4
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmovx.f16 s12, s10
-; CHECK-NEXT: vmov.16 q0[5], r4
-; CHECK-NEXT: vmov r4, s12
-; CHECK-NEXT: vmov.16 q3[0], r6
-; CHECK-NEXT: vmov.16 q0[6], r4
-; CHECK-NEXT: vmov r4, s5
-; CHECK-NEXT: vmov.16 q3[1], r4
-; CHECK-NEXT: vmov r4, s6
-; CHECK-NEXT: vmov.16 q3[2], r4
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: vmov.16 q3[3], r4
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov.16 q3[4], r4
-; CHECK-NEXT: vmov r4, s9
-; CHECK-NEXT: vmov.16 q3[5], r4
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: vmov.16 q3[6], r4
-; CHECK-NEXT: vmov r4, s11
-; CHECK-NEXT: vmovx.f16 s4, s11
-; CHECK-NEXT: vmov.16 q3[7], r4
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vadd.f16 q0, q0, q3
-; CHECK-NEXT: vstrb.8 q0, [r1], #16
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
+; CHECK-NEXT: mov r5, lr
+; CHECK-NEXT: vmul.f16 q2, q0, q0
+; CHECK-NEXT: vfma.f16 q2, q1, q1
+; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_5
; CHECK-NEXT: .LBB0_5: @ %middle.block
-; CHECK-NEXT: cmp r5, r2
-; CHECK-NEXT: mov lr, r8
-; CHECK-NEXT: bne .LBB0_7
-; CHECK-NEXT: b .LBB0_9
-; CHECK-NEXT: .LBB0_6:
-; CHECK-NEXT: mov r3, r0
-; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov lr, r2
-; CHECK-NEXT: .LBB0_7: @ %while.body.preheader26
+; CHECK-NEXT: cmp r4, r2
+; CHECK-NEXT: mov lr, r7
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r4, r5, r7, pc}
+; CHECK-NEXT: .LBB0_6: @ %while.body.preheader26
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .LBB0_8: @ %while.body
+; CHECK-NEXT: .LBB0_7: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r3]
; CHECK-NEXT: vldr.16 s2, [r3, #2]
@@ -100,9 +55,14 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
; CHECK-NEXT: vfma.f16 s0, s2, s2
; CHECK-NEXT: vstr.16 s0, [r12]
; CHECK-NEXT: add.w r12, r12, #2
-; CHECK-NEXT: le lr, .LBB0_8
-; CHECK-NEXT: .LBB0_9: @ %while.end
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: le lr, .LBB0_7
+; CHECK-NEXT: .LBB0_8: @ %while.end
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .LBB0_9:
+; CHECK-NEXT: mov r3, r0
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: mov lr, r2
+; CHECK-NEXT: b .LBB0_6
entry:
%cmp.not11 = icmp eq i32 %numSamples, 0
br i1 %cmp.not11, label %while.end, label %while.body.preheader
@@ -195,37 +155,28 @@ define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* n
; CHECK-NEXT: cmphi r3, r0
; CHECK-NEXT: bhi .LBB1_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
-; CHECK-NEXT: bic r5, r2, #3
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: subs r3, r5, #4
+; CHECK-NEXT: bic r4, r2, #3
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: sub.w r12, r4, #4
; CHECK-NEXT: and r7, r2, #3
-; CHECK-NEXT: add.w r12, r1, r5, lsl #2
-; CHECK-NEXT: add.w r3, r4, r3, lsr #2
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: add.w r3, r0, r5, lsl #3
+; CHECK-NEXT: add.w r3, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r1, r4, lsl #2
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: add.w r3, r0, r4, lsl #3
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0], #32
-; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: vmul.f32 q1, q1, q1
-; CHECK-NEXT: vmul.f32 q0, q0, q0
-; CHECK-NEXT: vmov.f64 d4, d2
+; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: mov lr, r5
; CHECK-NEXT: subs.w lr, lr, #1
-; CHECK-NEXT: mov r4, lr
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s14, s1
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vmov.f32 s15, s3
-; CHECK-NEXT: vadd.f32 q0, q3, q2
-; CHECK-NEXT: vstrb.8 q0, [r1], #16
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
+; CHECK-NEXT: mov r5, lr
+; CHECK-NEXT: vmul.f32 q2, q0, q0
+; CHECK-NEXT: vfma.f32 q2, q1, q1
+; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_5: @ %middle.block
-; CHECK-NEXT: cmp r5, r2
+; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: mov lr, r7
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r7, pc}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
index a1e1b4dbe748..47114092bfb1 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
@@ -7,12 +7,15 @@ target triple = "aarch64--linux-gnu"
define <4 x float> @vld2(<8 x float>* %pSrc) {
; CHECK-LABEL: @vld2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L2:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L3:%.*]] = shufflevector <8 x float> [[L2]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L26:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L43:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L43]], [[L26]]
; CHECK-NEXT: ret <4 x float> [[L6]]
;
entry:
@@ -28,15 +31,19 @@ entry:
define <4 x float> @vld3(<12 x float>* %pSrc) {
; CHECK-LABEL: @vld3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, <12 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L2:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L3:%.*]] = shufflevector <12 x float> [[L2]], <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <12 x float> [[L4]], <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
-; CHECK-NEXT: [[L7:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L8:%.*]] = shufflevector <12 x float> [[L7]], <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT: [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L8]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <12 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L29:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[L46:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L6:%.*]] = fadd fast <4 x float> [[L46]], [[L29]]
+; CHECK-NEXT: [[L73:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L73]]
; CHECK-NEXT: ret <4 x float> [[L9]]
;
entry:
@@ -55,17 +62,22 @@ entry:
define <4 x float> @vld4(<16 x float>* %pSrc) {
; CHECK-LABEL: @vld4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, <16 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT: [[L3:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L4:%.*]] = shufflevector <16 x float> [[L3]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[L5:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L6:%.*]] = shufflevector <16 x float> [[L5]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[L7:%.*]] = fadd fast <4 x float> [[L6]], [[L4]]
-; CHECK-NEXT: [[L8:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L9:%.*]] = shufflevector <16 x float> [[L8]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[L10:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L11:%.*]] = shufflevector <16 x float> [[L10]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[L12:%.*]] = fadd fast <4 x float> [[L11]], [[L9]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[L312:%.*]] = fmul <4 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[L59:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[L7:%.*]] = fadd fast <4 x float> [[L59]], [[L312]]
+; CHECK-NEXT: [[L86:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[L103:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[L12:%.*]] = fadd fast <4 x float> [[L103]], [[L86]]
; CHECK-NEXT: ret <4 x float> [[L12]]
;
entry:
@@ -86,13 +98,17 @@ entry:
define <4 x float> @twosrc(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
; CHECK-LABEL: @twosrc(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT: [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[L6:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L7:%.*]] = shufflevector <8 x float> [[L6]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L7]], [[L5]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN7:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 1
+; CHECK-NEXT: [[L46:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[L63:%.*]] = fmul <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L63]], [[L46]]
; CHECK-NEXT: ret <4 x float> [[L8]]
;
entry:
@@ -109,14 +125,17 @@ entry:
define <4 x float> @twosrc2(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
; CHECK-LABEL: @twosrc2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT: [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT: [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT: [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[WIDE_VEC26]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[L6:%.*]] = fmul fast <4 x float> [[S1]], [[S2]]
-; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L5]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT: [[LDN4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 1
+; CHECK-NEXT: [[L43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[L6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L43]]
; CHECK-NEXT: ret <4 x float> [[L8]]
;
entry: