[llvm] [IA]: Construct (de)interleave4 out of (de)interleave2 (PR #89276)
Hassnaa Hamdi via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 18 10:53:53 PDT 2024
https://github.com/hassnaaHamdi updated https://github.com/llvm/llvm-project/pull/89276
From 4547f272cb02c95db3d2b567cbd63c6cfcd3dcb7 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 23:44:44 +0000
Subject: [PATCH 1/6] [AArch64][Interleave]: Add test precommit
Change-Id: I5e2613156a482dcadae3e4cfa1bacdf7f3293fe2
---
.../AArch64/sve-interleave_accesses4-load.ll | 106 ++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
new file mode 100644
index 0000000000000..dcade71ccb684
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
+; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
+; CHECK-NEXT: add z2.s, z0.s, z6.s
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
+; CHECK-NEXT: sub z1.s, z1.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
+; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
+; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
+; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 4 x i32> %x, %5
+ %10 = sub nsw <vscale x 4 x i32> %7, %x
+ %11 = shl <vscale x 4 x i32> %6, %x
+ %12 = ashr <vscale x 4 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
+; CHECK-LABEL: wide_interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
+; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
+; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
+; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
+; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
+; CHECK-NEXT: add z3.s, z0.s, z27.s
+; CHECK-NEXT: movprfx z4, z26
+; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT: sub z2.s, z2.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
+; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
+; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
+; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
+; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
+; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
+; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
+; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: add z2.s, z1.s, z2.s
+; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
+; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
+; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
+; CHECK-NEXT: sub z4.s, z4.s, z1.s
+; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
+; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
+; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
+; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 8 x i32> %x, %5
+ %10 = sub nsw <vscale x 8 x i32> %7, %x
+ %11 = shl <vscale x 8 x i32> %6, %x
+ %12 = ashr <vscale x 8 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
+ store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
From 393ec7f46e81299afed1692427cc0271b861833d Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 18 Apr 2024 17:30:51 +0000
Subject: [PATCH 2/6] [IA]: Construct (de)interleave4 out of (de)interleave2
- The InterleavedAccess pass is updated to spot load/store (de)interleave4-like sequences,
and emit equivalent target intrinsics (e.g. sve.ld4 or sve.st4 on AArch64) for targets that support scalable vectors.
- Tests are added for targets that support scalable vectors.
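For illustration, the load-side IR tree that the pass now recognizes has the
following shape (a minimal sketch mirroring the tests below; value names are
illustrative):
  %wide = load <vscale x 16 x i32>, ptr %p, align 4
  ; The outer deinterleave2 splits the wide vector into even/odd halves.
  %root = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %ev = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 0
  %od = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 1
  ; Deinterleaving each half again yields the four leaf values, which can now
  ; be mapped onto the four results of a single structured (ld4-style) load.
  %l0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %ev)
  %l1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %od)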
Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 83 +++++++++++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 40 ++++++---
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +++++++--
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 +++++++++++++++++
.../RISCV/rvv/sve-deinterleave-load.ll | 74 +++++++++++++++++
8 files changed, 294 insertions(+), 28 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 974a5301f6e29..50aa09149632f 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -56,6 +56,8 @@
#include <cstdint>
#include <iterator>
#include <map>
+#include <queue>
+#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3158,6 +3160,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3169,6 +3172,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa..21e6ba79e365a 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,6 +70,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
+#include <queue>
#include <utility>
using namespace llvm;
@@ -488,12 +489,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ std::map<IntrinsicInst *, bool> mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ while (!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ // iterate over extract users of deinterleave
+ for (auto UserExtract : CurrentDI->users()) {
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ // iterate over deinterleave users of extract
+ for (auto UserDI : UserExtract->users()) {
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI || Child_DI->getIntrinsicID() !=
+ Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ TempLeafNodes.push_back(UserExtract);
+ TempDeadInsts.push_back(Extract);
+ } else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ }
+ // sort the deinterleaved nodes in the order that
+ // they will be extracted from the target-specific intrinsic.
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
+ TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -509,14 +555,38 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst *> InterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ InterleaveTreeQueue.push(II);
+ while (!InterleaveTreeQueue.empty()) {
+ auto node = InterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ InterleaveTreeQueue.pop();
+ for (unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() !=
+ Intrinsic::experimental_vector_interleave2)
+ continue;
+ InterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ TempLeafNodes.push_back(op);
+ }
+ }
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -537,7 +607,8 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+
+ else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4f819f5fcdd2..5126716457b1b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16586,15 +16586,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16650,9 +16651,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- else
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ llvm::Value *CurrentExtract = LeafNodes[I];
+ Value *Newextrct = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(Newextrct);
+ }
+ return true;
+ } else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16661,15 +16672,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16714,9 +16725,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
- else
+ if (UseScalable) {
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ } else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 48a4ea91c2782..4969ed476a9bb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -696,9 +696,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index af3950773e4d0..2cf26d76c7afb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21550,8 +21550,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21559,10 +21559,13 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy =
+ (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21590,6 +21593,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ auto CurrentExtract = LeafNodes[I];
+ Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21600,8 +21616,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21609,10 +21625,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21638,6 +21654,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3b8eb3c88901a..8ae43b02af5bf 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -875,10 +875,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..606bb93e309e1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x10
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, #1024 // =0x400
+; CHECK-NEXT: neg x10, x10
+; CHECK-NEXT: rdvl x11, #4
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x12, x1, x8
+; CHECK-NEXT: adds x9, x9, x10
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x2, x8
+; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x0, x8
+; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: add z16.s, z4.s, z0.s
+; CHECK-NEXT: sub z17.s, z1.s, z5.s
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT: movprfx z19, z3
+; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..2ea14b13265c6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: srli a3, a4, 1
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: li a5, 1024
+; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT: .LBB0_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vlseg4e32.v v8, (a1)
+; CHECK-NEXT: vlseg4e32.v v16, (a2)
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vsub.vv v10, v10, v18
+; CHECK-NEXT: vsll.vv v12, v12, v20
+; CHECK-NEXT: vsra.vv v14, v14, v22
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: sub a5, a5, a3
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a5, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
From e26c9235fb26845ef67afbbca13f99d9677081ae Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Mon, 29 Apr 2024 05:03:36 +0000
Subject: [PATCH 3/6] [PatternMatch]: Add m_Interleave and m_Deinterleave
matchers.
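A sketch of the IR these matchers are aimed at (value names are illustrative):
a factor-4 interleave written as nested interleave2 calls, which
m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
              m_Interleave2(m_Value(B), m_Value(D)))
can match in a single call:
  %lo = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %A, <vscale x 4 x i32> %C)
  %hi = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %B, <vscale x 4 x i32> %D)
  %out = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %lo, <vscale x 8 x i32> %hi)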
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
---
llvm/include/llvm/IR/PatternMatch.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 526b7258b8ab7..91f65bbca8f63 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2856,6 +2856,17 @@ inline VScaleVal_match m_VScale() {
return VScaleVal_match();
}
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
+ return m_Intrinsic<Intrinsic::vector_interleave2>(Op0, Op1);
+}
+
+template <typename Opnd>
+inline typename m_Intrinsic_Ty<Opnd>::Ty m_Deinterleave2(const Opnd &Op) {
+ return m_Intrinsic<Intrinsic::vector_deinterleave2>(Op);
+}
+
template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;
From a086c0d06ee220fd5ba9c376771761e1de794596 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 17:25:10 +0000
Subject: [PATCH 4/6] [AArch64]: Use PatternMatch to spot (de)interleave
accesses
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 -
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 83 +-------
.../Target/AArch64/AArch64ISelLowering.cpp | 187 +++++++++++++-----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +---
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 --------
.../AArch64/sve-interleave_accesses4-load.ll | 106 ----------
.../RISCV/rvv/sve-deinterleave-load.ll | 74 -------
.../AArch64/sve-deinterleave4.ll | 105 ++++++++++
.../AArch64/sve-interleave4.ll | 63 ++++++
11 files changed, 316 insertions(+), 427 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
delete mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 50aa09149632f..974a5301f6e29 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -56,8 +56,6 @@
#include <cstdint>
#include <iterator>
#include <map>
-#include <queue>
-#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3160,7 +3158,6 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3172,7 +3169,6 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 21e6ba79e365a..8c9065aec7faa 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,7 +70,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
-#include <queue>
#include <utility>
using namespace llvm;
@@ -489,57 +488,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
- std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- std::map<IntrinsicInst *, bool> mp;
- SmallVector<Instruction *> TempDeadInsts;
-
- DeinterleaveTreeQueue.push(DI);
- while (!DeinterleaveTreeQueue.empty()) {
- auto CurrentDI = DeinterleaveTreeQueue.top();
- DeinterleaveTreeQueue.pop();
- TempDeadInsts.push_back(CurrentDI);
- // iterate over extract users of deinterleave
- for (auto UserExtract : CurrentDI->users()) {
- Instruction *Extract = dyn_cast<Instruction>(UserExtract);
- if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
- continue;
- bool IsLeaf = true;
- // iterate over deinterleave users of extract
- for (auto UserDI : UserExtract->users()) {
- IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
- if (!Child_DI || Child_DI->getIntrinsicID() !=
- Intrinsic::experimental_vector_deinterleave2)
- continue;
- IsLeaf = false;
- if (mp.count(Child_DI) == 0) {
- DeinterleaveTreeQueue.push(Child_DI);
- }
- continue;
- }
- if (IsLeaf) {
- TempLeafNodes.push_back(UserExtract);
- TempDeadInsts.push_back(Extract);
- } else {
- TempDeadInsts.push_back(Extract);
- }
- }
- }
- // sort the deinterleaved nodes in the order that
- // they will be extracted from the target-specific intrinsic.
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
- TempDeadInsts.rend());
+ DeadInsts.push_back(DI);
DeadInsts.push_back(LI);
return true;
}
@@ -555,38 +509,14 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
- std::queue<IntrinsicInst *> InterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- SmallVector<Instruction *> TempDeadInsts;
-
- InterleaveTreeQueue.push(II);
- while (!InterleaveTreeQueue.empty()) {
- auto node = InterleaveTreeQueue.front();
- TempDeadInsts.push_back(node);
- InterleaveTreeQueue.pop();
- for (unsigned i = 0; i < 2; i++) {
- auto op = node->getOperand(i);
- if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
- if (CurrentII->getIntrinsicID() !=
- Intrinsic::experimental_vector_interleave2)
- continue;
- InterleaveTreeQueue.push(CurrentII);
- continue;
- }
- TempLeafNodes.push_back(op);
- }
- }
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
+ DeadInsts.push_back(II);
return true;
}
@@ -607,8 +537,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-
- else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5126716457b1b..4985907f7489b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16585,18 +16585,74 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
+bool getDeinterleavedValues(Value *DI,
+ SmallVectorImpl<Value *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
+ if (!DI->hasNUses(2))
+ return false;
+
+ // Make sure that the users of DI are extractvalue instructions.
+ auto *Extr0 = *(++DI->user_begin());
+ if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
+ return false;
+ auto *Extr1 = *(DI->user_begin());
+ if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ return false;
+
+ // Each extractvalue instruction is expected to have a single user,
+ // which should be another DI.
+ if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ return false;
+ auto *DI1 = *(Extr0->user_begin());
+ if (!match(DI1, m_Deinterleave2(m_Value())))
+ return false;
+ auto *DI2 = *(Extr1->user_begin());
+ if (!match(DI2, m_Deinterleave2(m_Value())))
+ return false;
+
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ return false;
+
+ // Leaf nodes of the deinterleave tree
+ auto *A = *(++DI1->user_begin());
+ auto *C = *(DI1->user_begin());
+ auto *B = *(++DI2->user_begin());
+ auto *D = *(DI2->user_begin());
+
+ DeinterleavedValues.push_back(A);
+ DeinterleavedValues.push_back(B);
+ DeinterleavedValues.push_back(C);
+ DeinterleavedValues.push_back(D);
+
+ // These values will not be used anymore;
+ // a factor-4 deinterleave (DI4) will be created instead of the nested DI1 and DI2.
+ DeadInsts.push_back(cast<Instruction>(DI1));
+ DeadInsts.push_back(cast<Instruction>(Extr0));
+ DeadInsts.push_back(cast<Instruction>(DI2));
+ DeadInsts.push_back(cast<Instruction>(Extr1));
+
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+ IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- const unsigned Factor = std::max(2, (int)LeafNodes.size());
-
- VectorType *VTy = (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ SmallVector<Value *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 10> DeadInsts;
const DataLayout &DL = DI->getModule()->getDataLayout();
+ unsigned Factor = 2;
+ VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+
+ if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ Factor = DeinterleavedValues.size();
+ VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+ }
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16607,7 +16663,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
VectorType *LdTy =
VectorType::get(VTy->getElementType(),
VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16617,7 +16672,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
UseScalable, LdTy, PtrTy);
IRBuilder<> Builder(LI);
-
Value *Pred = nullptr;
if (UseScalable)
Pred =
@@ -16626,9 +16680,8 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Value *BaseAddr = LI->getPointerOperand();
Value *Result;
if (NumLoads > 1) {
- Value *Left = PoisonValue::get(VTy);
- Value *Right = PoisonValue::get(VTy);
-
+ // Create multiple legal small ldN instead of a wide one.
+ SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
for (unsigned I = 0; I < NumLoads; ++I) {
Value *Offset = Builder.getInt64(I * Factor);
@@ -16638,50 +16691,79 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
else
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- Left = Builder.CreateInsertVector(
- VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
- Right = Builder.CreateInsertVector(
- VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+ for (int J = 0; J < Factor; ++J) {
+ WideValues[J] = Builder.CreateInsertVector(
+ VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+ }
+ }
+ if (Factor == 2)
+ Result = PoisonValue::get(StructType::get(VTy, VTy));
+ else
+ Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
+ // Construct the wide result out of the small results.
+ for (int J = 0; J < Factor; ++J) {
+ Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
-
- Result = PoisonValue::get(DI->getType());
- Result = Builder.CreateInsertValue(Result, Left, 0);
- Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable) {
+ if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- if (Factor == 2) {
- DI->replaceAllUsesWith(Result);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- llvm::Value *CurrentExtract = LeafNodes[I];
- Value *Newextrct = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(Newextrct);
- }
- return true;
- } else
+ else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
+ if (Factor > 2) {
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ llvm::Value *CurrentExtract = DeinterleavedValues[I];
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ cast<Instruction>(CurrentExtract)->eraseFromParent();
+ }
+ for (auto &dead : DeadInsts)
+ dead->eraseFromParent();
+ return true;
+ }
DI->replaceAllUsesWith(Result);
return true;
}
-bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
- // Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
- return false;
+bool getValuesToInterleave(Value *II,
+ SmallVectorImpl<Value *> &ValuesToInterleave) {
+ Value *A, *B, *C, *D;
+ // Try to match interleave of Factor 4
+ if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+ m_Interleave2(m_Value(B), m_Value(D))))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ ValuesToInterleave.push_back(C);
+ ValuesToInterleave.push_back(D);
+ return true;
+ }
- // leaf nodes are the nodes that will be interleaved
- const unsigned Factor = LeafNodes.size();
+ // Try to match interleave of Factor 2
+ if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ return true;
+ }
+
+ return false;
+}
- VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI) const {
+ LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
+
+ SmallVector<Value *, 4> ValuesToInterleave;
+ if (!getValuesToInterleave(II, ValuesToInterleave))
+ return false;
+ unsigned Factor = ValuesToInterleave.size();
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+ VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16710,28 +16792,25 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Pred =
Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
- Value *L = II->getOperand(0);
- Value *R = II->getOperand(1);
-
+ auto WideValues = ValuesToInterleave;
+ if (UseScalable)
+ ValuesToInterleave.push_back(Pred);
+ ValuesToInterleave.push_back(BaseAddr);
for (unsigned I = 0; I < NumStores; ++I) {
Value *Address = BaseAddr;
if (NumStores > 1) {
Value *Offset = Builder.getInt64(I * Factor);
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
- R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+ for (int J = 0; J < Factor; J++) {
+ ValuesToInterleave[J] =
+ Builder.CreateExtractVector(StTy, WideValues[J], Idx);
+ }
+ // Update the address.
+ ValuesToInterleave[ValuesToInterleave.size() - 1] = Address;
}
-
- if (UseScalable) {
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(Pred);
- Args.push_back(Address);
- Builder.CreateCall(StNFunc, Args);
- } else
- Builder.CreateCall(StNFunc, {L, R, Address});
+ Builder.CreateCall(StNFunc, ValuesToInterleave);
}
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 4969ed476a9bb..48a4ea91c2782 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -696,11 +696,9 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2cf26d76c7afb..af3950773e4d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21550,8 +21550,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21559,13 +21559,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = std::max(2, (int)LeafNodes.size());
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy =
- (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21593,19 +21590,6 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
- Ops.append({LI->getPointerOperand(), VL});
- Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
- if (Factor == 2) {
- DI->replaceAllUsesWith(Vlseg);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- auto CurrentExtract = LeafNodes[I];
- Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- }
- return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21616,8 +21600,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21625,10 +21609,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = LeafNodes.size();
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
+ VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21654,11 +21638,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(SI->getPointerOperand());
- Args.push_back(VL);
- Builder.CreateCall(VssegNFunc, Args);
- return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 8ae43b02af5bf..3b8eb3c88901a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -875,12 +875,10 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
deleted file mode 100644
index 606bb93e309e1..0000000000000
--- a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov w9, #1024 // =0x400
-; CHECK-NEXT: neg x10, x10
-; CHECK-NEXT: rdvl x11, #4
-; CHECK-NEXT: .LBB0_1: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x12, x1, x8
-; CHECK-NEXT: adds x9, x9, x10
-; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x2, x8
-; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x0, x8
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: add z16.s, z4.s, z0.s
-; CHECK-NEXT: sub z17.s, z1.s, z5.s
-; CHECK-NEXT: movprfx z18, z2
-; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
-; CHECK-NEXT: movprfx z19, z3
-; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
-; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
-; CHECK-NEXT: b.ne .LBB0_1
-; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
deleted file mode 100644
index dcade71ccb684..0000000000000
--- a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
+++ /dev/null
@@ -1,106 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
-; CHECK-LABEL: interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
-; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
-; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
-; CHECK-NEXT: add z2.s, z0.s, z6.s
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT: sub z1.s, z1.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
-; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
-; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
-; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
-; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 4 x i32> %x, %5
- %10 = sub nsw <vscale x 4 x i32> %7, %x
- %11 = shl <vscale x 4 x i32> %6, %x
- %12 = ashr <vscale x 4 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
-
-define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
-; CHECK-LABEL: wide_interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
-; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
-; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
-; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
-; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
-; CHECK-NEXT: add z3.s, z0.s, z27.s
-; CHECK-NEXT: movprfx z4, z26
-; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT: sub z2.s, z2.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
-; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
-; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
-; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
-; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
-; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
-; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
-; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
-; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: add z2.s, z1.s, z2.s
-; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
-; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
-; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
-; CHECK-NEXT: sub z4.s, z4.s, z1.s
-; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
-; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
-; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
-; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
- %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
- %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 8 x i32> %x, %5
- %10 = sub nsw <vscale x 8 x i32> %7, %x
- %11 = shl <vscale x 8 x i32> %6, %x
- %12 = ashr <vscale x 8 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
- store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
deleted file mode 100644
index 2ea14b13265c6..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: srli a3, a4, 1
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: li a5, 1024
-; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
-; CHECK-NEXT: .LBB0_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vlseg4e32.v v8, (a1)
-; CHECK-NEXT: vlseg4e32.v v16, (a2)
-; CHECK-NEXT: vadd.vv v8, v16, v8
-; CHECK-NEXT: vsub.vv v10, v10, v18
-; CHECK-NEXT: vsll.vv v12, v12, v20
-; CHECK-NEXT: vsra.vv v14, v14, v22
-; CHECK-NEXT: vsseg4e32.v v8, (a0)
-; CHECK-NEXT: sub a5, a5, a3
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: bnez a5, .LBB0_1
-; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
new file mode 100644
index 0000000000000..d6d0e98edb3c8
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @wide_deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @wide_deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP13]], i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
+; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP12]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP19]], <vscale x 8 x i32> [[TMP14]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP20]], <vscale x 8 x i32> [[TMP16]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP21]], <vscale x 8 x i32> [[TMP18]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 2
+; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 3
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 32 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @mix_deinterleave4_deinterleave2(ptr %src) {
+; CHECK-LABEL: define void @mix_deinterleave4_deinterleave2
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ %load1 = load <vscale x 8 x i32>, ptr %src, align 4
+ %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load1)
+ %ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
+ %ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
new file mode 100644
index 0000000000000..9e38172aaeff0
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
+; CHECK-LABEL: define void @wide_interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 0)
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 4
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %d)
+ %interleaved.vec = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.half1, <vscale x 16 x i32> %interleaved.half2)
+ store <vscale x 32 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @mix_interleave4_interleave2
+; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst1, align 4
+
+ %interleaved = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ store <vscale x 8 x i32> %interleaved, ptr %dst2, align 4
+ ret void
+}
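For readers of these tests, the store-side recognition boils down to one match over the interleave tree: interleave2(interleave2(A, C), interleave2(B, D)) is an interleave4 of A, B, C, D, which is what lets the pass emit a single st4. The sketch below is illustrative only: it assumes an m_Interleave2 matcher analogous to the m_Deinterleave2 matcher used on the load side, and the helper name getValuesToInterleave is hypothetical, not code from this patch.

// Sketch under the assumptions above: recognize an interleave4 built from
// an interleave2 tree and return the operands in the lane order st4 expects.
// m_Interleave2 is assumed to exist alongside m_Deinterleave2.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool getValuesToInterleave(Value *Root,
                                  SmallVectorImpl<Value *> &Operands) {
  Value *A, *B, *C, *D;
  // Root: interleave2(interleave2(A, C), interleave2(B, D))
  if (!match(Root, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                                 m_Interleave2(m_Value(B), m_Value(D)))))
    return false;
  Operands.append({A, B, C, D}); // lane order A, B, C, D for st4
  return true;
}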
>From 6b0cbeee483a3ec10154a44d2fa869205ab084e7 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 13 Jun 2024 14:45:35 +0000
Subject: [PATCH 5/6] Add DeadCodeElim pass to the RUN line
Change-Id: I2b2dc683dba21cdb6c35f407868a7537245c845e
---
.../InterleavedAccess/AArch64/sve-interleave4.ll | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
index 9e38172aaeff0..6cbd201ab36a2 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+; RUN: opt < %s -passes=interleaved-access,dce -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
; CHECK-NEXT: ret void
;
@@ -20,8 +18,6 @@ define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
; CHECK-LABEL: define void @wide_interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
@@ -46,8 +42,6 @@ define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32
define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @mix_interleave4_interleave2
; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
; CHECK-NEXT: ret void
>From e5154bd575c4d94aa03153eed04211984423df7a Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 18 Jun 2024 17:10:52 +0000
Subject: [PATCH 6/6] Fix assumption about the extraction order, make it generic,
then verify the order using pattern matching
Change-Id: I053e47d156c37cf4d7ab5b2af83c348b4210631a
---
.../Target/AArch64/AArch64ISelLowering.cpp | 110 ++++++++++--------
1 file changed, 61 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4985907f7489b..db8da3eb372bd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16586,67 +16586,80 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool getDeinterleavedValues(Value *DI,
- SmallVectorImpl<Value *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- if (!DI->hasNUses(2))
+ SmallVectorImpl<Instruction *> &DeinterleavedValues) {
+ if (!DI->hasNUsesOrMore(2))
return false;
-
- // make sure that the users of DI are extractValue instructions
- auto *Extr0 = *(++DI->user_begin());
- if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
- return false;
- auto *Extr1 = *(DI->user_begin());
- if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
return false;
- // each extractValue instruction is expected to have a single user,
- // which should be another DI
- if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
return false;
- auto *DI1 = *(Extr0->user_begin());
- if (!match(DI1, m_Deinterleave2(m_Value())))
+ auto *DI1 = *(Extr1->user_begin());
+ auto *DI2 = *(Extr2->user_begin());
+
+ if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
return false;
- auto *DI2 = *(Extr1->user_begin());
- if (!match(DI2, m_Deinterleave2(m_Value())))
+ // Leaf nodes of the deinterleave tree:
+ auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
+ // Make sure that A, B, C, D are ExtractValue instructions
+ // before reading their extract indices
+ if (!A || !B || !C || !D)
return false;
- if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ DeinterleavedValues.resize(4);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
+ DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
+ DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
+ DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
+
+ // Make sure that A,B,C,D match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[1], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[2], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
return false;
-
- // Leaf nodes of the deinterleave tree
- auto *A = *(++DI1->user_begin());
- auto *C = *(DI1->user_begin());
- auto *B = *(++DI2->user_begin());
- auto *D = *(DI2->user_begin());
-
- DeinterleavedValues.push_back(A);
- DeinterleavedValues.push_back(B);
- DeinterleavedValues.push_back(C);
- DeinterleavedValues.push_back(D);
-
- // These Values will not be used anymre,
- // DI4 will be created instead of nested DI1 and DI2
- DeadInsts.push_back(cast<Instruction>(DI1));
- DeadInsts.push_back(cast<Instruction>(Extr0));
- DeadInsts.push_back(cast<Instruction>(DI2));
- DeadInsts.push_back(cast<Instruction>(Extr1));
-
+ }
+ // Swap the middle values to convert tree order into lane order.
+ std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
return true;
}
+void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
+ Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
+ match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
+ assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
+ match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
+ assert(DeadExtract != nullptr && "Match is expected to succeed");
+ DeadRoot->eraseFromParent();
+ if (DeadDeinterleave->getNumUses() == 0)
+ cast<Instruction>(DeadDeinterleave)->eraseFromParent();
+ if (DeadExtract->getNumUses() == 0)
+ cast<Instruction>(DeadExtract)->eraseFromParent();
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- SmallVector<Value *, 4> DeinterleavedValues;
- SmallVector<Instruction *, 10> DeadInsts;
+ SmallVector<Instruction *, 4> DeinterleavedValues;
const DataLayout &DL = DI->getModule()->getDataLayout();
unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ if (getDeinterleavedValues(DI, DeinterleavedValues)) {
Factor = DeinterleavedValues.size();
VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
}
@@ -16693,7 +16706,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
WideValues[J] = Builder.CreateInsertVector(
VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
}
@@ -16703,7 +16716,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
else
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
// Construct the wide result out of the small results.
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
} else {
@@ -16713,15 +16726,14 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
if (Factor > 2) {
+ // Iterate over the old deinterleaved values, replacing each
+ // with its new deinterleaved counterpart.
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- llvm::Value *CurrentExtract = DeinterleavedValues[I];
Value *NewExtract = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- cast<Instruction>(CurrentExtract)->eraseFromParent();
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
-
- for (auto &dead : DeadInsts)
- dead->eraseFromParent();
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
+ deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
return true;
}
DI->replaceAllUsesWith(Result);
@@ -16803,7 +16815,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; J++) {
+ for (unsigned J = 0; J < Factor; J++) {
ValuesToInterleave[J] =
Builder.CreateExtractVector(StTy, WideValues[J], Idx);
}