[llvm] [IA]: Construct (de)interleave4 out of (de)interleave2 (PR #89276)
Hassnaa Hamdi via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 28 22:04:23 PDT 2024
https://github.com/hassnaaHamdi updated https://github.com/llvm/llvm-project/pull/89276
From 7c19cad16025dfae5efb1eab1110af46351835ba Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 18 Apr 2024 17:30:51 +0000
Subject: [PATCH 1/2] [IA]: Construct (de)interleave4 out of (de)interleave2
- The InterleavedAccess pass is updated to recognize load/store (de)interleave4-like
sequences built out of (de)interleave2 nodes, and to emit the equivalent
sve.ld4/sve.st4 (or RISC-V segmented load/store) intrinsics on targets that
support scalable vectors (see the IR sketch after the diff stats below).
- Tests are added for targets that support scalable vectors.
Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 83 +++++++++++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 40 ++++++---
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +++++++--
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 +++++++++++++++++
.../RISCV/rvv/sve-deinterleave-load.ll | 74 +++++++++++++++++
8 files changed, 294 insertions(+), 28 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
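For context, the factor-4 pattern that the pass now recognizes is a two-level
tree of (de)interleave2 nodes, as exercised by the tests below. A minimal
load-side sketch (value names are illustrative only):

  %wide = load <vscale x 16 x i32>, ptr %p, align 4
  %root = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %evens = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 0
  %odds = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 1
  %dei.lo = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %evens)
  %dei.hi = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %odds)
  ; The four extractvalue results of %dei.lo and %dei.hi are the leaf nodes
  ; (lanes 0, 2, 1 and 3); the pass collects them and rewrites the whole tree
  ; into a single ld4/vlseg4-style load.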
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..93218ad22bdd48 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -56,6 +56,8 @@
#include <cstdint>
#include <iterator>
#include <map>
+#include <queue>
+#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
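+  /// \p LeafNodes are the leaf extractvalue users of the deinterleave tree,
+  /// listed in the lane order produced by the target load intrinsic.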
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
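+  /// \p LeafNodes are the leaf values to be interleaved, in lane order.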
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..33501cbf132e5d 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,6 +70,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
+#include <map>
+#include <queue>
+#include <stack>
#include <utility>
using namespace llvm;
@@ -510,12 +511,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+  std::stack<IntrinsicInst *> DeinterleaveTreeStack;
+  SmallVector<Value *> TempLeafNodes, LeafNodes;
+  std::map<IntrinsicInst *, bool> Visited;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  DeinterleaveTreeStack.push(DI);
+  while (!DeinterleaveTreeStack.empty()) {
+    auto *CurrentDI = DeinterleaveTreeStack.top();
+    DeinterleaveTreeStack.pop();
+    TempDeadInsts.push_back(CurrentDI);
+    // Iterate over the extractvalue users of the deinterleave.
+    for (auto *UserExtract : CurrentDI->users()) {
+      auto *Extract = dyn_cast<Instruction>(UserExtract);
+      if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+        continue;
+      bool IsLeaf = true;
+      // Iterate over the deinterleave users of the extract.
+      for (auto *UserDI : UserExtract->users()) {
+        auto *ChildDI = dyn_cast<IntrinsicInst>(UserDI);
+        if (!ChildDI || ChildDI->getIntrinsicID() !=
+                            Intrinsic::experimental_vector_deinterleave2)
+          continue;
+        IsLeaf = false;
+        // Only queue nodes that have not been visited yet.
+        if (!Visited.count(ChildDI)) {
+          Visited[ChildDI] = true;
+          DeinterleaveTreeStack.push(ChildDI);
+        }
+      }
+      TempDeadInsts.push_back(Extract);
+      if (IsLeaf)
+        TempLeafNodes.push_back(UserExtract);
+    }
+  }
+  // Reorder the leaf nodes into the order in which they will be produced
+  // by the target-specific intrinsic.
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
+ TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +577,38 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+  std::queue<IntrinsicInst *> InterleaveTreeQueue;
+  SmallVector<Value *> TempLeafNodes, LeafNodes;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  InterleaveTreeQueue.push(II);
+  while (!InterleaveTreeQueue.empty()) {
+    auto *Node = InterleaveTreeQueue.front();
+    TempDeadInsts.push_back(Node);
+    InterleaveTreeQueue.pop();
+    for (unsigned I = 0; I < 2; I++) {
+      Value *Op = Node->getOperand(I);
+      // An interleave2 operand is an interior node of the tree; any other
+      // value is a leaf to be interleaved.
+      auto *CurrentII = dyn_cast<IntrinsicInst>(Op);
+      if (CurrentII && CurrentII->getIntrinsicID() ==
+                           Intrinsic::experimental_vector_interleave2) {
+        InterleaveTreeQueue.push(CurrentII);
+        continue;
+      }
+      TempLeafNodes.push_back(Op);
+ }
+ }
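+  // Reorder the leaf values into lane order: for factor 4 the breadth-first
+  // walk above collects them as {lane0, lane2, lane1, lane3}, so taking the
+  // even positions first and the odd positions second restores
+  // {lane0, lane1, lane2, lane3}.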
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +629,8 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() ==
+ Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..5b8193372d02cd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  VectorType *VTy = !LeafNodes.empty()
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,9 +16410,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- else
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
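+      // Replace each leaf extract of the deinterleave tree with the
+      // corresponding result of the ldN call.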
+      for (unsigned I = 0; I < LeafNodes.size(); I++) {
+        Value *CurrentExtract = LeafNodes[I];
+        Value *NewExtract = Builder.CreateExtractValue(Result, I);
+        CurrentExtract->replaceAllUsesWith(NewExtract);
+      }
+ return true;
+ } else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16431,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+  // The leaf nodes are the values that will be interleaved.
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,9 +16484,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
- else
+ if (UseScalable) {
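+      // Pass all leaf values of the interleave tree to stN, followed by the
+      // predicate and the base address.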
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ } else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..4863e7aa0a1619 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..96ae0de5b2f6b8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21024,8 +21024,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21033,13 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy =
+      !LeafNodes.empty()
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21067,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
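+    // Replace each leaf extract of the deinterleave tree with the
+    // corresponding field of the vlsegN result.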
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+      auto *CurrentExtract = LeafNodes[I];
+ Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21074,8 +21090,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21099,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21128,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
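+    // Forward the leaf values of the interleave tree to vssegN, followed by
+    // the pointer operand and VL.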
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..05becb05625f2e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..606bb93e309e12
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x10
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, #1024 // =0x400
+; CHECK-NEXT: neg x10, x10
+; CHECK-NEXT: rdvl x11, #4
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x12, x1, x8
+; CHECK-NEXT: adds x9, x9, x10
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x2, x8
+; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x0, x8
+; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: add z16.s, z4.s, z0.s
+; CHECK-NEXT: sub z17.s, z1.s, z5.s
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT: movprfx z19, z3
+; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..2ea14b13265c61
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: srli a3, a4, 1
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: li a5, 1024
+; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT: .LBB0_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vlseg4e32.v v8, (a1)
+; CHECK-NEXT: vlseg4e32.v v16, (a2)
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vsub.vv v10, v10, v18
+; CHECK-NEXT: vsll.vv v12, v12, v20
+; CHECK-NEXT: vsra.vv v14, v14, v22
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: sub a5, a5, a3
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a5, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
From c55d6a02d6123c4b9ca373d8ec2842db5a1171b8 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Mon, 29 Apr 2024 05:03:36 +0000
Subject: [PATCH 2/2] [PatternMatch]: Add m_Interleave2 and m_Deinterleave2
matchers.
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
---
llvm/include/llvm/IR/PatternMatch.h | 61 +++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 92cb79d54afc29..e788962782756d 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2793,6 +2793,67 @@ inline VScaleVal_match m_VScale() {
return VScaleVal_match();
}
+template <typename LHS, typename RHS>
+struct Interleave2_match {
+ LHS L;
+ RHS R;
+
+ Interleave2_match(const LHS &L, const RHS &R) : L(L), R(R) {}
+
+ template <typename ITy> bool match(ITy *V) {
+    auto *I = dyn_cast<IntrinsicInst>(V);
+    if (I &&
+        I->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+      return L.match(I->getOperand(0)) && R.match(I->getOperand(1));
+    return false;
+ }
+};
+
+template <typename LHS, typename RHS>
+inline Interleave2_match<LHS, RHS> m_Interleave2(const LHS &L, const RHS &R) {
+ return Interleave2_match<LHS, RHS>(L, R);
+}
+
+// Match a deinterleave tree.
+// If the current users of the deinterleave are the last nodes in the tree
+// (extractvalue instructions), match those users directly. Otherwise there
+// are still deinterleave nodes in the tree, so match the next deinterleave,
+// which is the user of the extract.
+template <typename LHS, typename RHS>
+struct Deinterleave2_match {
+ LHS L;
+ RHS R;
+
+ Deinterleave2_match(const LHS &L, const RHS &R) : L(L), R(R) {}
+
+ template <typename ITy> bool match(ITy *V) {
+    auto *I = dyn_cast<IntrinsicInst>(V);
+    if (!I ||
+        I->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+      return false;
+    if (!I->hasNUses(2))
+      return false;
+
+    User *UserI1 = *I->user_begin();
+    User *UserI0 = *(++I->user_begin());
+
+    if (!PatternMatch::match(UserI1, m_ExtractValue<1>(m_Value())) ||
+        !PatternMatch::match(UserI0, m_ExtractValue<0>(m_Value())))
+      return false;
+
+    // Either the extracts themselves are the leaves, or the tree continues
+    // through the user of each extract.
+    return (L.match(UserI0) && R.match(UserI1)) ||
+           (!UserI0->user_empty() && !UserI1->user_empty() &&
+            L.match(*UserI0->user_begin()) &&
+            R.match(*UserI1->user_begin()));
+ }
+};
+
+template <typename LHS, typename RHS>
+inline Deinterleave2_match<LHS, RHS> m_Deinterleave2(const LHS &L,
+                                                     const RHS &R) {
+ return Deinterleave2_match<LHS, RHS>(L, R);
+}
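+
+// For example (illustrative only), a factor-4 interleave built out of
+// interleave2 nodes could be matched with:
+//   m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+//                 m_Interleave2(m_Value(B), m_Value(D)))
+// binding A, B, C and D to lanes 0-3 of the interleaved result.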
+
template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;