[llvm] [AMDGPU] Accept arbitrary sized sources in CalculateByteProvider (PR #70240)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 6 20:06:28 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/70240
>From 85a17a4439fb4932db48c66cccaa89fb79d45423 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 23 Aug 2023 13:28:00 -0700
Subject: [PATCH 1/2] [AMDGPU] Accept arbitrary sized sources in
CalculateByteProvider
This allows working with e.g. v8i8 / v16i8 sources.
It is generally useful, but is primarily beneficial when allowing e.g. v8i8s to be passed to branches directly through registers. As such, this is the first in a series of patches to enable that work. However, it affects https://reviews.llvm.org/D155995, so it has been implemented on top of that.
Differential Revision: https://reviews.llvm.org/D159036
Change-Id: Idfcb57dacd0c32cab040fe4dd4ac2ec762750664
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 209 ++--
llvm/test/CodeGen/AMDGPU/idot4u.ll | 1087 +++++++++++++++++
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 15 +-
llvm/test/CodeGen/AMDGPU/load-hi16.ll | 36 +-
llvm/test/CodeGen/AMDGPU/permute.ll | 4 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 260 ++++
6 files changed, 1498 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3d4adb16a27162..2e2ae4148f6f26 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11567,8 +11567,7 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;
- auto ValueSize = Op.getValueSizeInBits();
- if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+ if (Op.getValueSizeInBits() < 8)
return std::nullopt;
switch (Op->getOpcode()) {
@@ -11827,8 +11826,6 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
auto VecIdx = IdxOp->getZExtValue();
auto ScalarSize = Op.getScalarValueSizeInBits();
if (ScalarSize != 32) {
- if ((VecIdx + 1) * ScalarSize > 32)
- return std::nullopt;
Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
}
@@ -11913,9 +11910,6 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
- assert(Op.getValueType().isByteSized());
- assert(OtherOp.getValueType().isByteSized());
-
auto TempOp = peekThroughBitcasts(Op);
auto TempOtherOp = peekThroughBitcasts(OtherOp);
@@ -11933,15 +11927,38 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}
+static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
+ unsigned DWordOffset) {
+ SDValue Ret;
+ if (Src.getValueSizeInBits() <= 32)
+ return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
+
+ if (Src.getValueSizeInBits() >= 256) {
+ assert(!(Src.getValueSizeInBits() % 32));
+ Ret = DAG.getBitcast(
+ MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
+ DAG.getConstant(DWordOffset, SL, MVT::i32));
+ }
+
+ Ret = DAG.getBitcastedAnyExtOrTrunc(
+ Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
+ if (DWordOffset) {
+ auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
+ DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
+ }
+
+ return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+}
+
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
-
- if (VT != MVT::i32)
- return SDValue();
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
// VT is known to be MVT::i32, so we need to provide 4 bytes.
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+ assert(VT == MVT::i32);
for (int i = 0; i < 4; i++) {
// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =
@@ -11955,8 +11972,8 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (PermNodes.size() != 4)
return SDValue();
- int FirstSrc = 0;
- std::optional<int> SecondSrc;
+ std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
+ std::optional<std::pair<unsigned, unsigned>> SecondSrc;
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
@@ -11964,33 +11981,31 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// by sizeof(Src2) = 4
int SrcByteAdjust = 4;
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
- if (SecondSrc.has_value())
- if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+ // If the Src uses a byte from a different DWORD, then it corresponds
+ // with a different source
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
+ ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
+ if (SecondSrc)
+ if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
+ ((PermOp.SrcOffset / 4) != SecondSrc->second))
return SDValue();
// Set the index of the second distinct Src node
- SecondSrc = i;
- assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
+ SecondSrc = {i, PermNodes[i].SrcOffset / 4};
+ assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
SrcByteAdjust = 0;
}
- assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+ assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());
- PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+ PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
}
-
- SDValue Op = *PermNodes[FirstSrc].Src;
- SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
- : *PermNodes[FirstSrc].Src;
-
- // Check that we haven't just recreated the same FSHR node.
- if (N->getOpcode() == ISD::FSHR &&
- (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
- (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
- return SDValue();
+ SDLoc DL(N);
+ SDValue Op = *PermNodes[FirstSrc.first].Src;
+ Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
+ assert(Op.getValueSizeInBits() == 32);
// Check that we are not just extracting the bytes in order from an op
- if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
+ if (!SecondSrc) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
@@ -12002,8 +12017,16 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
}
+ SDValue OtherOp =
+ SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
+
+ if (SecondSrc)
+ OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
+
+ assert(Op.getValueSizeInBits() == 32);
+
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
- SDLoc DL(N);
+
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
@@ -12018,7 +12041,6 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(PermMask, DL, MVT::i32));
}
-
return SDValue();
}
@@ -13530,17 +13552,24 @@ static unsigned addPermMasks(unsigned First, unsigned Second) {
return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
+struct DotSrc {
+ SDValue SrcOp;
+ int64_t PermMask;
+ int64_t DWordOffset;
+};
+
static void placeSources(ByteProvider<SDValue> &Src0,
ByteProvider<SDValue> &Src1,
- SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
- SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
- int Step) {
+ SmallVectorImpl<DotSrc> &Src0s,
+ SmallVectorImpl<DotSrc> &Src1s, int Step) {
assert(Src0.Src.has_value() && Src1.Src.has_value());
// Src0s and Src1s are empty, just place arbitrarily.
if (Step == 0) {
- Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
- Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+ Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
+ Src0.SrcOffset / 4});
+ Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
+ Src1.SrcOffset / 4});
return;
}
@@ -13553,38 +13582,38 @@ static void placeSources(ByteProvider<SDValue> &Src0,
unsigned FMask = 0xFF << (8 * (3 - Step));
unsigned FirstMask =
- BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
unsigned SecondMask =
- BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
// Attempt to find Src vector which contains our SDValue, if so, add our
// perm mask to the existing one. If we are unable to find a match for the
// first SDValue, attempt to find match for the second.
int FirstGroup = -1;
for (int I = 0; I < 2; I++) {
- SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
- I == 0 ? Src0s : Src1s;
- auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
- return IterElt.first == *BPP.first.Src;
+ SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
+ auto MatchesFirst = [&BPP](DotSrc &IterElt) {
+ return IterElt.SrcOp == *BPP.first.Src &&
+ (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
};
auto Match = llvm::find_if(Srcs, MatchesFirst);
if (Match != Srcs.end()) {
- Match->second = addPermMasks(FirstMask, Match->second);
+ Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
FirstGroup = I;
break;
}
}
if (FirstGroup != -1) {
- SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
- FirstGroup == 1 ? Src0s : Src1s;
- auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
- return IterElt.first == *BPP.second.Src;
+ SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
+ auto MatchesSecond = [&BPP](DotSrc &IterElt) {
+ return IterElt.SrcOp == *BPP.second.Src &&
+ (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
};
auto Match = llvm::find_if(Srcs, MatchesSecond);
if (Match != Srcs.end()) {
- Match->second = addPermMasks(SecondMask, Match->second);
+ Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
} else
- Srcs.push_back({*BPP.second.Src, SecondMask});
+ Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
return;
}
}
@@ -13596,29 +13625,32 @@ static void placeSources(ByteProvider<SDValue> &Src0,
unsigned FMask = 0xFF << (8 * (3 - Step));
Src0s.push_back(
- {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+ {*Src0.Src,
+ ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
+ Src0.SrcOffset / 4});
Src1s.push_back(
- {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+ {*Src1.Src,
+ ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
+ Src1.SrcOffset / 4});
return;
}
-static SDValue
-resolveSources(SelectionDAG &DAG, SDLoc SL,
- SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
- bool IsSigned, bool IsAny) {
+static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
+ SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
+ bool IsAny) {
// If we just have one source, just permute it accordingly.
if (Srcs.size() == 1) {
auto Elt = Srcs.begin();
- auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+ auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
- // v_perm will produce the original value.
- if (Elt->second == 0x3020100)
- return EltVal;
+ // v_perm will produce the original value
+ if (Elt->PermMask == 0x3020100)
+ return EltOp;
- return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
- DAG.getConstant(Elt->second, SL, MVT::i32));
+ return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
+ DAG.getConstant(Elt->PermMask, SL, MVT::i32));
}
auto FirstElt = Srcs.begin();
@@ -13629,8 +13661,8 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
// If we have multiple sources in the chain, combine them via perms (using
// calculated perm mask) and Ors.
while (true) {
- auto FirstMask = FirstElt->second;
- auto SecondMask = SecondElt->second;
+ auto FirstMask = FirstElt->PermMask;
+ auto SecondMask = SecondElt->PermMask;
unsigned FirstCs = FirstMask & 0x0c0c0c0c;
unsigned FirstPlusFour = FirstMask | 0x04040404;
@@ -13640,9 +13672,9 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
auto PermMask = addPermMasks(FirstMask, SecondMask);
auto FirstVal =
- DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+ getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
auto SecondVal =
- DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+ getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
SecondVal,
@@ -13656,12 +13688,12 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
// If we only have a FirstElt, then just combine that into the cumulative
// source node.
if (SecondElt == Srcs.end()) {
- auto EltVal =
- DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+ auto EltOp =
+ getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
Perms.push_back(
- DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
- DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+ DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
+ DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
break;
}
}
@@ -13672,9 +13704,8 @@ resolveSources(SelectionDAG &DAG, SDLoc SL,
: Perms[0];
}
-static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
- unsigned ChainLength) {
- for (auto &[EntryVal, EntryMask] : Srcs) {
+static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
+ for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
EntryMask = EntryMask >> ((4 - ChainLength) * 8);
auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
EntryMask += ZeroMask;
@@ -13774,8 +13805,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
SDValue TempNode(N, 0);
std::optional<bool> IsSigned;
- SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
- SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+ SmallVector<DotSrc, 4> Src0s;
+ SmallVector<DotSrc, 4> Src1s;
SmallVector<SDValue, 4> Src2s;
// Match the v_dot4 tree, while collecting src nodes.
@@ -13857,11 +13888,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
// (commutation).
bool UseOriginalSrc = false;
if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
- Src0s.begin()->second == Src1s.begin()->second &&
- Src0s.begin()->first.getValueSizeInBits() == 32 &&
- Src1s.begin()->first.getValueSizeInBits() == 32) {
+ Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
+ Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
+ Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
SmallVector<unsigned, 4> SrcBytes;
- auto Src0Mask = Src0s.begin()->second;
+ auto Src0Mask = Src0s.begin()->PermMask;
SrcBytes.push_back(Src0Mask & 0xFF000000);
bool UniqueEntries = true;
for (auto I = 1; I < 4; I++) {
@@ -13876,11 +13907,19 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
if (UniqueEntries) {
UseOriginalSrc = true;
- // Must be 32 bits to enter above conditional.
- assert(Src0s.begin()->first.getValueSizeInBits() == 32);
- assert(Src1s.begin()->first.getValueSizeInBits() == 32);
- Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
- Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+
+ auto FirstElt = Src0s.begin();
+ auto FirstEltOp =
+ getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
+
+ auto SecondElt = Src1s.begin();
+ auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
+ SecondElt->DWordOffset);
+
+ Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
+ MVT::getIntegerVT(32));
+ Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
+ MVT::getIntegerVT(32));
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index ba9ccb4c636f94..9a1de74034cd83 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -4884,6 +4884,1093 @@ entry:
ret void
}
+define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_hilo:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 8
+; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_hilo:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2
+; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_hilo:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_hilo:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_hilo:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_hilo:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_clause 0x1
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+ %v1e0 = extractelement <8 x i8> %vec1, i64 4
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec2, i64 0
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <8 x i8> %vec1, i64 5
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec2, i64 1
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <8 x i8> %vec1, i64 6
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec2, i64 2
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <8 x i8> %vec1, i64 7
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec2, i64 3
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_lohi:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
+; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_lohi:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX8-NEXT: v_bfe_u32 v8, v2, 8, 8
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX8-NEXT: v_mad_u32_u24 v3, v5, v8, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, v6, v2, v3
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_lohi:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_lohi:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_lohi:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x10302
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_lohi:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_clause 0x1
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x10302
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+ %v1e0 = extractelement <8 x i8> %vec1, i64 0
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec2, i64 7
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <8 x i8> %vec1, i64 1
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec2, i64 6
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <8 x i8> %vec1, i64 2
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec2, i64 5
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <8 x i8> %vec1, i64 3
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec2, i64 4
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_hihi:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v5, v0, 16, 8
+; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_hihi:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8
+; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
+; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_hihi:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_bfe_u32 v4, v2, 16, 8
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
+; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_hihi:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_hihi:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4
+; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x1030200
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_hihi:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_clause 0x1
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:4
+; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x1030200
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+ %v1e0 = extractelement <8 x i8> %vec1, i64 4
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec2, i64 6
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <8 x i8> %vec1, i64 6
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec2, i64 4
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <8 x i8> %vec1, i64 5
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec2, i64 7
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <8 x i8> %vec1, i64 7
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec2, i64 5
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v8i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8
+; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v8i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 8
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v8i8:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v3, v4, v5
+; GFX9-NODL-NEXT: v_add3_u32 v0, v1, v6, v0
+; GFX9-NODL-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v8i8:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v8i8:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v8i8:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+
+
+ %v1e0 = extractelement <8 x i8> %vec1, i64 0
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec1, i64 4
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <8 x i8> %vec1, i64 1
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec1, i64 5
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <8 x i8> %vec1, i64 2
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec1, i64 6
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <8 x i8> %vec1, i64 3
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec1, i64 7
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v16i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v2, v2, v5
+; GFX7-NEXT: v_bfe_u32 v6, v3, 8, 8
+; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v16i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
+; GFX8-NEXT: flat_load_dword v4, v[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
+; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 8
+; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 8
+; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v16i8:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NODL-NEXT: ; kill: killed $vgpr5
+; GFX9-NODL-NEXT: ; kill: killed $vgpr4
+; GFX9-NODL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v0, v5, s[6:7]
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0
+; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v16i8:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7]
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT: ; kill: killed $vgpr5
+; GFX9-DL-NEXT: ; kill: killed $vgpr4
+; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0
+; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v16i8:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: ; kill: killed $vgpr5
+; GFX10-DL-NEXT: ; kill: killed $vgpr4
+; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7]
+; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v16i8:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7]
+; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
+; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+ %v1e0 = extractelement <16 x i8> %vec1, i64 8
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec2, i64 0
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <16 x i8> %vec1, i64 10
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec2, i64 1
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <16 x i8> %vec1, i64 13
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec2, i64 2
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <16 x i8> %vec1, i64 15
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec2, i64 3
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
+; GFX7-LABEL: idot4_acc32_v256i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 offset:252
+; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v4, v0, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
+; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: idot4_acc32_v256i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT: s_movk_i32 s2, 0xfc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2
+; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
+; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: idot4_acc32_v256i8:
+; GFX9-NODL: ; %bb.0: ; %entry
+; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: global_load_dword v2, v1, s[4:5] offset:252
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v2
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v3
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: idot4_acc32_v256i8:
+; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v3, v1, s[4:5] offset:252
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_v256i8:
+; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252
+; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0x3020001
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_v256i8:
+; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252
+; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x3020001
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: s_nop 0
+; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT: s_endpgm
+ ptr addrspace(1) %src2,
+ ptr addrspace(1) nocapture %dst) {
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <256 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+
+ %v1e0 = extractelement <256 x i8> %vec1, i64 255
+ %cv1e0 = zext i8 %v1e0 to i32
+ %v2e0 = extractelement <8 x i8> %vec2, i64 0
+ %cv2e0 = zext i8 %v2e0 to i32
+ %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
+
+ %v1e1 = extractelement <256 x i8> %vec1, i64 254
+ %cv1e1 = zext i8 %v1e1 to i32
+ %v2e1 = extractelement <8 x i8> %vec2, i64 1
+ %cv2e1 = zext i8 %v2e1 to i32
+ %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
+
+ %v1e2 = extractelement <256 x i8> %vec1, i64 252
+ %cv1e2 = zext i8 %v1e2 to i32
+ %v2e2 = extractelement <8 x i8> %vec2, i64 2
+ %cv2e2 = zext i8 %v2e2 to i32
+ %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
+
+ %v1e3 = extractelement <256 x i8> %vec1, i64 253
+ %cv1e3 = zext i8 %v1e3 to i32
+ %v2e3 = extractelement <8 x i8> %vec2, i64 3
+ %cv2e3 = zext i8 %v2e3 to i32
+ %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
+
+ %add1 = add i32 %mul1, 0
+ %add2 = add i32 %add1, %mul2
+ %add3 = add i32 %add2, %mul3
+ %add4 = add i32 %add3, %mul4
+ store i32 %add4, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_anyext:
; GFX7: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 47f7943e076a4a..1ba2491d2210ec 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2220,14 +2220,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: s_lshl_b32 s1, s4, 16
-; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: v_mov_b32_e32 v6, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_bfi_b32 v3, s2, v3, v3
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -2302,14 +2300,13 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_bfi_b32 v3, s2, v6, v3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v1
+; VI-NEXT: v_bfi_b32 v3, s0, v6, v3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 64c96c8edf5ee9..0c61c58ef06192 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
index 330e5be6d531f7..6cab2b18393070 100644
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -111,19 +111,21 @@ bb:
ret void
}
+; FIXME: produce v_alignbit_b32 v2, v2, s0, 24 instead of v_perm
define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh8_or_lsr24:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_alignbit_b32 v2, v2, s0, 24
+; GCN-NEXT: v_perm_b32 v2, s0, v2, v3
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 5f896f92de0f42..a8d53856c7c616 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3404,3 +3404,263 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}
+
+define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_hilo:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3060505
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_hilo:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3060505
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e5 = extractelement <8 x i8> %vec1, i64 5
+ %zv1e5 = zext i8 %v1e5 to i32
+ %byte1 = shl i32 %zv1e5, 8
+
+ %v1e6 = extractelement <8 x i8> %vec1, i64 6
+ %zv1e6 = zext i8 %v1e6 to i32
+ %byte2 = shl i32 %zv1e6, 16
+ %v2e3 = extractelement <8 x i8> %vec2, i64 3
+ %zv2e3 = zext i8 %v2e3 to i32
+ %byte3 = shl i32 %zv2e3, 24
+
+ %tmp0 = or i32 %zv1e5, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_lohi:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x70404
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_lohi:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x70404
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <8 x i8> %vec1, i64 0
+ %zv1e0 = zext i8 %v1e0 to i32
+ %byte1 = shl i32 %zv1e0, 8
+
+ %v1e3 = extractelement <8 x i8> %vec1, i64 3
+ %zv1e3 = zext i8 %v1e3 to i32
+ %byte2 = shl i32 %zv1e3, 16
+ %v2e4 = extractelement <8 x i8> %vec2, i64 4
+ %zv2e4 = zext i8 %v2e4 to i32
+ %byte3 = shl i32 %zv2e4, 24
+
+ %tmp0 = or i32 %zv1e0, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_hihi:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2070505
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_hihi:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x2070505
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e5 = extractelement <8 x i8> %vec1, i64 5
+ %zv1e5 = zext i8 %v1e5 to i32
+ %byte1 = shl i32 %zv1e5, 8
+
+ %v1e7 = extractelement <8 x i8> %vec1, i64 7
+ %zv1e7 = zext i8 %v1e7 to i32
+ %byte2 = shl i32 %zv1e7, 16
+ %v2e6 = extractelement <8 x i8> %vec2, i64 6
+ %zv2e6 = zext i8 %v2e6 to i32
+ %byte3 = shl i32 %zv2e6, 24
+
+ %tmp0 = or i32 %zv1e5, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_v8i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404
+; GFX10-NEXT: global_store_dword v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v8i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1070404
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+ %v1e4 = extractelement <8 x i8> %vec1, i64 4
+ %zv1e4 = zext i8 %v1e4 to i32
+ %byte1 = shl i32 %zv1e4, 8
+
+ %v1e7 = extractelement <8 x i8> %vec1, i64 7
+ %zv1e7 = zext i8 %v1e7 to i32
+ %byte2 = shl i32 %zv1e7, 16
+ %v2e1 = extractelement <8 x i8> %vec1, i64 1
+ %zv2e1 = zext i8 %v2e1 to i32
+ %byte3 = shl i32 %zv2e1, 24
+
+ %tmp0 = or i32 %zv1e4, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_v256i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:252
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6050707
+; GFX10-NEXT: global_store_dword v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v256i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:252
+; GFX9-NEXT: s_mov_b32 s4, 0x6050707
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4
+ %v1e4 = extractelement <256 x i8> %vec1, i64 255
+ %zv1e4 = zext i8 %v1e4 to i32
+ %byte1 = shl i32 %zv1e4, 8
+
+ %v1e7 = extractelement <256 x i8> %vec1, i64 253
+ %zv1e7 = zext i8 %v1e7 to i32
+ %byte2 = shl i32 %zv1e7, 16
+ %v2e1 = extractelement <256 x i8> %vec1, i64 254
+ %zv2e1 = zext i8 %v2e1 to i32
+ %byte3 = shl i32 %zv2e1, 24
+
+ %tmp0 = or i32 %zv1e4, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+; TODO: support this pattern
+define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract_3src:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
+; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_3src:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
+; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <8 x i8> %vec1, i64 0
+ %zv1e0 = zext i8 %v1e0 to i32
+ %byte1 = shl i32 %zv1e0, 8
+
+ %v1e5 = extractelement <8 x i8> %vec1, i64 5
+ %zv1e5 = zext i8 %v1e5 to i32
+ %byte2 = shl i32 %zv1e5, 16
+ %v2e6 = extractelement <8 x i8> %vec2, i64 6
+ %zv2e6 = zext i8 %v2e6 to i32
+ %byte3 = shl i32 %zv2e6, 24
+
+ %tmp0 = or i32 %zv1e0, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
>From 7468bfdb8bd20d34cead95ab6f411c5d4dcb13ce Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 24 Oct 2023 17:52:01 -0700
Subject: [PATCH 2/2] [AMDGPU] Make getDWordFromOffset robust against exotic
types + handle vectors in CalcByteProvider
Change-Id: I88775857394ac698e25ca1b89d7092d1dee50c33
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 105 +++++++--
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 272 ++++++++++++++++++++--
2 files changed, 337 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e2ae4148f6f26..4faf8087751124 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11570,6 +11570,9 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Op.getValueSizeInBits() < 8)
return std::nullopt;
+ if (Op.getValueType().isVector())
+ return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+
switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
@@ -11636,8 +11639,12 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
if (Index > BitWidth / 8 - 1)
return std::nullopt;
+ bool IsVec = Op.getValueType().isVector();
switch (Op.getOpcode()) {
case ISD::OR: {
+ if (IsVec)
+ return std::nullopt;
+
auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
StartingIndex);
if (!RHS)
@@ -11658,6 +11665,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::AND: {
+ if (IsVec)
+ return std::nullopt;
+
auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!BitMaskOp)
return std::nullopt;
@@ -11678,6 +11688,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::FSHR: {
+ if (IsVec)
+ return std::nullopt;
+
// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
if (!ShiftOp || Op.getValueType().isVector())
@@ -11703,6 +11716,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
case ISD::SRA:
case ISD::SRL: {
+ if (IsVec)
+ return std::nullopt;
+
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
return std::nullopt;
@@ -11728,6 +11744,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::SHL: {
+ if (IsVec)
+ return std::nullopt;
+
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
return std::nullopt;
@@ -11752,6 +11771,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
case ISD::SIGN_EXTEND_INREG:
case ISD::AssertZext:
case ISD::AssertSext: {
+ if (IsVec)
+ return std::nullopt;
+
SDValue NarrowOp = Op->getOperand(0);
unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
@@ -11773,6 +11795,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::TRUNCATE: {
+ if (IsVec)
+ return std::nullopt;
+
uint64_t NarrowByteWidth = BitWidth / 8;
if (NarrowByteWidth >= Index) {
@@ -11815,9 +11840,13 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
}
- case ISD::BSWAP:
+ case ISD::BSWAP: {
+ if (IsVec)
+ return std::nullopt;
+
return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
Depth + 1, StartingIndex);
+ }
case ISD::EXTRACT_VECTOR_ELT: {
auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -11834,6 +11863,9 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case AMDGPUISD::PERM: {
+ if (IsVec)
+ return std::nullopt;
+
auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
if (!PermMask)
return std::nullopt;
@@ -11930,25 +11962,55 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
unsigned DWordOffset) {
SDValue Ret;
- if (Src.getValueSizeInBits() <= 32)
- return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
- if (Src.getValueSizeInBits() >= 256) {
- assert(!(Src.getValueSizeInBits() % 32));
- Ret = DAG.getBitcast(
- MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
- DAG.getConstant(DWordOffset, SL, MVT::i32));
- }
+ auto TypeSize = Src.getValueSizeInBits().getFixedValue();
+  // The bit width must be a whole number of bytes (a multiple of 8 bits).
+ assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
- Ret = DAG.getBitcastedAnyExtOrTrunc(
- Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
- if (DWordOffset) {
- auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
- DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
- return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
- }
+ if (TypeSize <= 32)
+ return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
+ if (Src.getValueType().isVector()) {
+ auto ScalarTySize = Src.getScalarValueSizeInBits();
+ auto ScalarTy = Src.getValueType().getScalarType();
+ if (ScalarTySize == 32) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
+ DAG.getConstant(DWordOffset, SL, MVT::i32));
+ }
+ if (ScalarTySize > 32) {
+ Ret = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
+ DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
+ auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
+ if (ShiftVal)
+ Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
+ DAG.getConstant(ShiftVal, SL, MVT::i32));
+ return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+ }
+
+ assert(ScalarTySize < 32);
+ auto NumElements = TypeSize / ScalarTySize;
+ auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
+ auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
+ auto NumElementsIn32 = 32 / ScalarTySize;
+ auto NumAvailElements = DWordOffset < Trunc32Elements
+ ? NumElementsIn32
+ : NumElements - NormalizedTrunc;
+
+ SmallVector<SDValue, 4> VecSrcs;
+ DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
+ NumAvailElements);
+
+ Ret = DAG.getBuildVector(
+ MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
+ VecSrcs);
+ return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+ }
+
+  // Scalar (non-vector) source type.
+ auto ShiftVal = 32 * DWordOffset;
+ Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
+ DAG.getConstant(ShiftVal, SL, MVT::i32));
return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
}
@@ -12017,13 +12079,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
}
- SDValue OtherOp =
- SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
+ SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
- if (SecondSrc)
+ if (SecondSrc) {
OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
-
- assert(Op.getValueSizeInBits() == 32);
+ assert(OtherOp.getValueSizeInBits() == 32);
+ }
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a8d53856c7c616..8ac332197215f5 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3409,21 +3409,21 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_hilo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: global_load_dword v6, v[2:3], off
+; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3060505
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3060505
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_hilo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[2:3], off
+; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX9-NEXT: s_mov_b32 s4, 0x3060505
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3451,21 +3451,21 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_lohi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off
-; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x70404
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x70404
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_lohi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off
-; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v7, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0x70404
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3493,21 +3493,21 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_hihi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2070505
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x2070505
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_hihi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX9-NEXT: s_mov_b32 s4, 0x2070505
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3664,3 +3664,239 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}
+
+; Should not result in crash
+define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v6i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
+; GFX10-NEXT: global_load_ushort v3, v[0:1], off
+; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
+; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: global_store_dword v[6:7], v1, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v6i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6
+; GFX9-NEXT: global_load_ushort v3, v[0:1], off
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v3
+; GFX9-NEXT: global_store_dword v[4:5], v1, off
+; GFX9-NEXT: global_store_dword v[6:7], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2
+ %el0 = extractelement <6 x i16> %vec, i32 0
+ %el1 = extractelement <6 x i16> %vec, i32 1
+ %el2 = extractelement <6 x i16> %vec, i32 2
+ %el3 = extractelement <6 x i16> %vec, i32 3
+ %z0 = zext i16 %el0 to i32
+ %z1 = zext i16 %el1 to i32
+ %s1 = shl nuw i32 %z1, 16
+ %o0 = or i32 %s1, %z0
+ %z2 = zext i16 %el2 to i32
+ %z3 = zext i16 %el3 to i32
+ %s3 = shl nuw i32 %z3, 16
+ %o1 = or i32 %z2, %s3
+
+ store i32 %o0, ptr addrspace(1) %out0, align 4
+ store i32 %o1, ptr addrspace(1) %out1, align 4
+ ret void
+}
+
+
+; Should not result in crash
+define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v7i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: global_store_dword v[6:7], v1, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v7i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: global_store_dword v[6:7], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2
+ %el0 = extractelement <7 x i16> %vec, i32 0
+ %el1 = extractelement <7 x i16> %vec, i32 1
+ %el2 = extractelement <7 x i16> %vec, i32 2
+ %el3 = extractelement <7 x i16> %vec, i32 3
+ %z0 = zext i16 %el0 to i32
+ %z1 = zext i16 %el1 to i32
+ %s1 = shl nuw i32 %z1, 16
+ %o0 = or i32 %s1, %z0
+ %z2 = zext i16 %el2 to i32
+ %z3 = zext i16 %el3 to i32
+ %s3 = shl nuw i32 %z3, 16
+ %o1 = or i32 %z2, %s3
+
+ store i32 %o0, ptr addrspace(1) %out0, align 4
+ store i32 %o1, ptr addrspace(1) %out1, align 4
+ ret void
+}
+
+; Should not result in crash
+define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v13i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_bfe_u32 v0, v2, 8, 8
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: global_store_dword v[6:7], v1, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v13i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
+; GFX9-NEXT: s_mov_b32 s4, 0x5040c00
+; GFX9-NEXT: s_mov_b32 s5, 0x5040c03
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_bfe_u32 v0, v2, 8, 8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s5
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: global_store_dword v[6:7], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
+ %el0 = extractelement <13 x i8> %vec, i32 0
+ %el1 = extractelement <13 x i8> %vec, i32 1
+ %el2 = extractelement <13 x i8> %vec, i32 7
+ %el3 = extractelement <13 x i8> %vec, i32 8
+ %z0 = zext i8 %el0 to i32
+ %z1 = zext i8 %el1 to i32
+ %s1 = shl nuw i32 %z1, 16
+ %o0 = or i32 %s1, %z0
+ %z2 = zext i8 %el2 to i32
+ %z3 = zext i8 %el3 to i32
+ %s3 = shl nuw i32 %z3, 16
+ %o1 = or i32 %z2, %s3
+
+ store i32 %o0, ptr addrspace(1) %out0, align 4
+ store i32 %o1, ptr addrspace(1) %out1, align 4
+ ret void
+}
+
+; Extracting and truncating elements of an odd-sized <13 x i64> source should
+; not result in a crash in calculateByteProvider; the bytes are still combined
+; with v_perm_b32.
+define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v13i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_perm_b32 v0, v9, v8, 0x3020504
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v1, v11, v12, 0x1000706
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v13i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020504
+; GFX9-NEXT:    s_mov_b32 s5, 0x1000706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_perm_b32 v0, v9, v8, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v1, v11, v12, s5
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    global_store_dword v[6:7], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <13 x i64>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <13 x i64> %vec, i32 0
+  %el1 = extractelement <13 x i64> %vec, i32 1
+  %el2 = extractelement <13 x i64> %vec, i32 7
+  %el3 = extractelement <13 x i64> %vec, i32 8
+  %el00 = lshr i64 %el0, 32
+  %t0 = trunc i64 %el00 to i16
+  %z0 = zext i16 %t0 to i32
+  %z1 = trunc i64 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %t2 = trunc i64 %el2 to i16
+  %z2 = zext i16 %t2 to i32
+  %z3 = trunc i64 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}
+
+
+; The low 16 bits of each i32 element of the truncated <2 x i32> load should be
+; combined into a single i32 (low half in bits 15:0, high half in bits 31:16).
+define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: trunc_vector:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[4:5], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: trunc_vector:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v3, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <2 x i32>, ptr addrspace(1) %in0, align 2
+  %tvec = trunc <2 x i32> %vec to <2 x i16>
+  %el0 = extractelement <2 x i16> %tvec, i32 0
+  %el1 = extractelement <2 x i16> %tvec, i32 1
+  %z0 = zext i16 %el0 to i32
+  %z1 = zext i16 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  ret void
+}
More information about the llvm-commits
mailing list