[llvm] c82ebfb - Revert "[AMDGPU] Accept arbitrary sized sources in CalculateByteProvider"
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 25 03:11:36 PDT 2023
Author: Jay Foad
Date: 2023-10-25T11:11:27+01:00
New Revision: c82ebfb97ad1730bb7a3e29b7d4f33dec6226872
URL: https://github.com/llvm/llvm-project/commit/c82ebfb97ad1730bb7a3e29b7d4f33dec6226872
DIFF: https://github.com/llvm/llvm-project/commit/c82ebfb97ad1730bb7a3e29b7d4f33dec6226872.diff
LOG: Revert "[AMDGPU] Accept arbitrary sized sources in CalculateByteProvider"
This reverts commit ef33659492325de7871c8c85e35bd9c1c37f7347.
It was causing incorrect codegen for some Vulkan CTS tests.
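For context, the main functional change restored by this revert is the stricter size check in calculateSrcByte (first hunk of SIISelLowering.cpp below): only 8-, 16- and 32-bit sources are treated as valid byte providers, whereas the reverted patch accepted any source of at least 8 bits. A minimal standalone C++ sketch of that guard follows (a simplified illustration only, not the actual LLVM code; the free function name is hypothetical):

    #include <cstdint>
    #include <optional>

    // Mirror of the restored check: reject any source whose bit width is not
    // exactly 8, 16 or 32 before trying to track individual bytes through it.
    std::optional<uint64_t> acceptedSourceWidth(uint64_t ValueSizeInBits) {
      if (ValueSizeInBits != 8 && ValueSizeInBits != 16 && ValueSizeInBits != 32)
        return std::nullopt;
      return ValueSizeInBits;
    }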
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/idot4u.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/load-hi16.ll
llvm/test/CodeGen/AMDGPU/permute.ll
llvm/test/CodeGen/AMDGPU/permute_i8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c97486437ed83f1..ff5d0e27277267b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10834,7 +10834,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;
- if (Op.getValueSizeInBits() < 8)
+ auto ValueSize = Op.getValueSizeInBits();
+ if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
return std::nullopt;
switch (Op->getOpcode()) {
@@ -11125,6 +11126,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
auto VecIdx = IdxOp->getZExtValue();
auto ScalarSize = Op.getScalarValueSizeInBits();
if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
}
@@ -11210,6 +11213,9 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
+ assert(Op.getValueType().isByteSized());
+ assert(OtherOp.getValueType().isByteSized());
+
auto TempOp = peekThroughBitcasts(Op);
auto TempOtherOp = peekThroughBitcasts(OtherOp);
@@ -11227,38 +11233,15 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}
-static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
- unsigned DWordOffset) {
- SDValue Ret;
- if (Src.getValueSizeInBits() <= 32)
- return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
-
- if (Src.getValueSizeInBits() >= 256) {
- assert(!(Src.getValueSizeInBits() % 32));
- Ret = DAG.getBitcast(
- MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
- DAG.getConstant(DWordOffset, SL, MVT::i32));
- }
-
- Ret = DAG.getBitcastedAnyExtOrTrunc(
- Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
- if (DWordOffset) {
- auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
- DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
- return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
- }
-
- return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
-}
-
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- [[maybe_unused]] EVT VT = N->getValueType(0);
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
// VT is known to be MVT::i32, so we need to provide 4 bytes.
- assert(VT == MVT::i32);
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
for (int i = 0; i < 4; i++) {
// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =
@@ -11272,8 +11255,8 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (PermNodes.size() != 4)
return SDValue();
- std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
- std::optional<std::pair<unsigned, unsigned>> SecondSrc;
+ int FirstSrc = 0;
+ std::optional<int> SecondSrc;
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
@@ -11281,31 +11264,33 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// by sizeof(Src2) = 4
int SrcByteAdjust = 4;
- // If the Src uses a byte from a different DWORD, then it corresponds
- // with a difference source
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
- ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
- if (SecondSrc)
- if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
- ((PermOp.SrcOffset / 4) != SecondSrc->second))
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+ if (SecondSrc.has_value())
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
return SDValue();
// Set the index of the second distinct Src node
- SecondSrc = {i, PermNodes[i].SrcOffset / 4};
- assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
+ SecondSrc = i;
+ assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
SrcByteAdjust = 0;
}
- assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());
- PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}
- SDLoc DL(N);
- SDValue Op = *PermNodes[FirstSrc.first].Src;
- Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
- assert(Op.getValueSizeInBits() == 32);
+
+ SDValue Op = *PermNodes[FirstSrc].Src;
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+ : *PermNodes[FirstSrc].Src;
+
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
// Check that we are not just extracting the bytes in order from an op
- if (!SecondSrc) {
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
@@ -11317,16 +11302,8 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
}
- SDValue OtherOp =
- SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
-
- if (SecondSrc)
- OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
-
- assert(Op.getValueSizeInBits() == 32);
-
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
-
+ SDLoc DL(N);
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
@@ -11341,6 +11318,7 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(PermMask, DL, MVT::i32));
}
+
return SDValue();
}
@@ -12816,24 +12794,17 @@ static unsigned addPermMasks(unsigned First, unsigned Second) {
return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
-struct DotSrc {
- SDValue SrcOp;
- int64_t PermMask;
- int64_t DWordOffset;
-};
-
static void placeSources(ByteProvider<SDValue> &Src0,
ByteProvider<SDValue> &Src1,
- SmallVectorImpl<DotSrc> &Src0s,
- SmallVectorImpl<DotSrc> &Src1s, int Step) {
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
+ int Step) {
assert(Src0.Src.has_value() && Src1.Src.has_value());
// Src0s and Src1s are empty, just place arbitrarily.
if (Step == 0) {
- Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
- Src0.SrcOffset / 4});
- Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
- Src1.SrcOffset / 4});
+ Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
+ Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
return;
}
@@ -12846,38 +12817,38 @@ static void placeSources(ByteProvider<SDValue> &Src0,
unsigned FMask = 0xFF << (8 * (3 - Step));
unsigned FirstMask =
- (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
unsigned SecondMask =
- (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
// Attempt to find Src vector which contains our SDValue, if so, add our
// perm mask to the existing one. If we are unable to find a match for the
// first SDValue, attempt to find match for the second.
int FirstGroup = -1;
for (int I = 0; I < 2; I++) {
- SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
- auto MatchesFirst = [&BPP](DotSrc &IterElt) {
- return IterElt.SrcOp == *BPP.first.Src &&
- (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ I == 0 ? Src0s : Src1s;
+ auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.first.Src;
};
auto Match = llvm::find_if(Srcs, MatchesFirst);
if (Match != Srcs.end()) {
- Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
+ Match->second = addPermMasks(FirstMask, Match->second);
FirstGroup = I;
break;
}
}
if (FirstGroup != -1) {
- SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
- auto MatchesSecond = [&BPP](DotSrc &IterElt) {
- return IterElt.SrcOp == *BPP.second.Src &&
- (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ FirstGroup == 1 ? Src0s : Src1s;
+ auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.second.Src;
};
auto Match = llvm::find_if(Srcs, MatchesSecond);
if (Match != Srcs.end()) {
- Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
+ Match->second = addPermMasks(SecondMask, Match->second);
} else
- Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
+ Srcs.push_back({*BPP.second.Src, SecondMask});
return;
}
}
@@ -12889,32 +12860,29 @@ static void placeSources(ByteProvider<SDValue> &Src0,
unsigned FMask = 0xFF << (8 * (3 - Step));
Src0s.push_back(
- {*Src0.Src,
- ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
- Src1.SrcOffset / 4});
+ {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
Src1s.push_back(
- {*Src1.Src,
- ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
- Src1.SrcOffset / 4});
+ {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
return;
}
-static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
- SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
- bool IsAny) {
+static SDValue
+resolveSources(SelectionDAG &DAG, SDLoc SL,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ bool IsSigned, bool IsAny) {
// If we just have one source, just permute it accordingly.
if (Srcs.size() == 1) {
auto Elt = Srcs.begin();
- auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
+ auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
- // v_perm will produce the original value
- if (Elt->PermMask == 0x3020100)
- return EltOp;
+ // v_perm will produce the original value.
+ if (Elt->second == 0x3020100)
+ return EltVal;
- return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
- DAG.getConstant(Elt->PermMask, SL, MVT::i32));
+ return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(Elt->second, SL, MVT::i32));
}
auto FirstElt = Srcs.begin();
@@ -12925,8 +12893,8 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
// If we have multiple sources in the chain, combine them via perms (using
// calculated perm mask) and Ors.
while (true) {
- auto FirstMask = FirstElt->PermMask;
- auto SecondMask = SecondElt->PermMask;
+ auto FirstMask = FirstElt->second;
+ auto SecondMask = SecondElt->second;
unsigned FirstCs = FirstMask & 0x0c0c0c0c;
unsigned FirstPlusFour = FirstMask | 0x04040404;
@@ -12936,9 +12904,9 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
auto PermMask = addPermMasks(FirstMask, SecondMask);
auto FirstVal =
- getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
auto SecondVal =
- getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
+ DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
SecondVal,
@@ -12952,12 +12920,12 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
// If we only have a FirstElt, then just combine that into the cumulative
// source node.
if (SecondElt == Srcs.end()) {
- auto EltOp =
- getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
+ auto EltVal =
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
Perms.push_back(
- DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
- DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
+ DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(FirstElt->second, SL, MVT::i32)));
break;
}
}
@@ -12968,8 +12936,9 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
: Perms[0];
}
-static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
- for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
+static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ unsigned ChainLength) {
+ for (auto &[EntryVal, EntryMask] : Srcs) {
EntryMask = EntryMask >> ((4 - ChainLength) * 8);
auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
EntryMask += ZeroMask;
@@ -13034,8 +13003,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
SDValue TempNode(N, 0);
std::optional<bool> IsSigned;
- SmallVector<DotSrc, 4> Src0s;
- SmallVector<DotSrc, 4> Src1s;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
SmallVector<SDValue, 4> Src2s;
// Match the v_dot4 tree, while collecting src nodes.
@@ -13113,11 +13082,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
// (commutation).
bool UseOriginalSrc = false;
if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
- Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
- Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
- Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
+ Src0s.begin()->second == Src1s.begin()->second &&
+ Src0s.begin()->first.getValueSizeInBits() == 32 &&
+ Src1s.begin()->first.getValueSizeInBits() == 32) {
SmallVector<unsigned, 4> SrcBytes;
- auto Src0Mask = Src0s.begin()->PermMask;
+ auto Src0Mask = Src0s.begin()->second;
SrcBytes.push_back(Src0Mask & 0xFF000000);
bool UniqueEntries = true;
for (auto I = 1; I < 4; I++) {
@@ -13132,19 +13101,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
if (UniqueEntries) {
UseOriginalSrc = true;
-
- auto FirstElt = Src0s.begin();
- auto FirstEltOp =
- getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
-
- auto SecondElt = Src1s.begin();
- auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
- SecondElt->DWordOffset);
-
- Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
- MVT::getIntegerVT(32));
- Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
- MVT::getIntegerVT(32));
+ // Must be 32 bits to enter above conditional.
+ assert(Src0s.begin()->first.getValueSizeInBits() == 32);
+ assert(Src1s.begin()->first.getValueSizeInBits() == 32);
+ Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
+ Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index e6b6a0bedd72e99..a82c5215f3b2c65 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -4622,1091 +4622,4 @@ entry:
ret void
}
-define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_hilo:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 8
-; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_hilo:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_hilo:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
-; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_hilo:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_hilo:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_hilo:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-
- %v1e0 = extractelement <8 x i8> %vec1, i64 4
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec2, i64 0
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <8 x i8> %vec1, i64 5
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec2, i64 1
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <8 x i8> %vec1, i64 6
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec2, i64 2
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <8 x i8> %vec1, i64 7
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec2, i64 3
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_lohi:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
-; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_lohi:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
-; GFX8-NEXT: v_bfe_u32 v8, v2, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v3, v3, v7, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX8-NEXT: v_mad_u32_u24 v3, v5, v8, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v6, v2, v3
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_lohi:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
-; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_lohi:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302
-; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_lohi:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x10302
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_lohi:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x10302
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-
- %v1e0 = extractelement <8 x i8> %vec1, i64 0
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec2, i64 7
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <8 x i8> %vec1, i64 1
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec2, i64 6
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <8 x i8> %vec1, i64 2
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec2, i64 5
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <8 x i8> %vec1, i64 3
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec2, i64 4
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_hihi:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8
-; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
-; GFX7-NEXT: v_bfe_u32 v5, v0, 16, 8
-; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_hihi:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
-; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8
-; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_hihi:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_bfe_u32 v4, v2, 16, 8
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5
-; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_hihi:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200
-; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_hihi:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4
-; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x1030200
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_hihi:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:4
-; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x1030200
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-
- %v1e0 = extractelement <8 x i8> %vec1, i64 4
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec2, i64 6
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <8 x i8> %vec1, i64 6
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec2, i64 4
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <8 x i8> %vec1, i64 5
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec2, i64 7
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <8 x i8> %vec1, i64 7
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec2, i64 5
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_v8i8:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8
-; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
-; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
-; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_v8i8:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 8
-; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_v8i8:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v3, v4, v5
-; GFX9-NODL-NEXT: v_add3_u32 v0, v1, v6, v0
-; GFX9-NODL-NEXT: global_store_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_v8i8:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_v8i8:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_v8i8:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
-
-
- %v1e0 = extractelement <8 x i8> %vec1, i64 0
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec1, i64 4
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <8 x i8> %vec1, i64 1
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec1, i64 5
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <8 x i8> %vec1, i64 2
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec1, i64 6
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <8 x i8> %vec1, i64 3
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec1, i64 7
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_v16i8:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
-; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64
-; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX7-NEXT: v_mul_u32_u24_e32 v2, v2, v5
-; GFX7-NEXT: v_bfe_u32 v6, v3, 8, 8
-; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_v16i8:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
-; GFX8-NEXT: flat_load_dword v4, v[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
-; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 8
-; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 8
-; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_v16i8:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NODL-NEXT: ; kill: killed $vgpr5
-; GFX9-NODL-NEXT: ; kill: killed $vgpr4
-; GFX9-NODL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v0, v5, s[6:7]
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v0
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0
-; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_v16i8:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT: ; kill: killed $vgpr5
-; GFX9-DL-NEXT: ; kill: killed $vgpr4
-; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
-; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0
-; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_v16i8:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: ; kill: killed $vgpr5
-; GFX10-DL-NEXT: ; kill: killed $vgpr4
-; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7]
-; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_v16i8:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7]
-; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
-; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-
- %v1e0 = extractelement <16 x i8> %vec1, i64 8
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec2, i64 0
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <16 x i8> %vec1, i64 10
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec2, i64 1
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <16 x i8> %vec1, i64 13
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec2, i64 2
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <16 x i8> %vec1, i64 15
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec2, i64 3
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
-; GFX7-LABEL: idot4_acc32_v256i8:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, s3
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 offset:252
-; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[8:11], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_u32 v4, v0, 16, 8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
-; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
-; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: idot4_acc32_v256i8:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_movk_i32 s2, 0xfc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
-; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
-; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
-; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: idot4_acc32_v256i8:
-; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v2, v1, s[4:5] offset:252
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7]
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v2
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v3
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: idot4_acc32_v256i8:
-; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: global_load_dword v3, v1, s[4:5] offset:252
-; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1
-; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: idot4_acc32_v256i8:
-; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252
-; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0x3020001
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: idot4_acc32_v256i8:
-; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252
-; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x3020001
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: s_nop 0
-; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-DL-NEXT: s_endpgm
- ptr addrspace(1) %src2,
- ptr addrspace(1) nocapture %dst) {
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <256 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
-
- %v1e0 = extractelement <256 x i8> %vec1, i64 255
- %cv1e0 = zext i8 %v1e0 to i32
- %v2e0 = extractelement <8 x i8> %vec2, i64 0
- %cv2e0 = zext i8 %v2e0 to i32
- %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
-
- %v1e1 = extractelement <256 x i8> %vec1, i64 254
- %cv1e1 = zext i8 %v1e1 to i32
- %v2e1 = extractelement <8 x i8> %vec2, i64 1
- %cv2e1 = zext i8 %v2e1 to i32
- %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
-
- %v1e2 = extractelement <256 x i8> %vec1, i64 252
- %cv1e2 = zext i8 %v1e2 to i32
- %v2e2 = extractelement <8 x i8> %vec2, i64 2
- %cv2e2 = zext i8 %v2e2 to i32
- %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
-
- %v1e3 = extractelement <256 x i8> %vec1, i64 253
- %cv1e3 = zext i8 %v1e3 to i32
- %v2e3 = extractelement <8 x i8> %vec2, i64 3
- %cv2e3 = zext i8 %v2e3 to i32
- %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
-
- %add1 = add i32 %mul1, 0
- %add2 = add i32 %add1, %mul2
- %add3 = add i32 %add2, %mul3
- %add4 = add i32 %add3, %mul4
- store i32 %add4, ptr addrspace(1) %dst, align 4
- ret void
-}
-
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 3daa88a7474d36d..f98b41ba199bd7f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2226,12 +2226,14 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: s_lshl_b32 s1, s4, 16
+; VI-NEXT: s_mov_b32 s2, 0xffff
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfi_b32 v3, s2, v3, v3
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -2306,13 +2308,14 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v6, s4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_bfi_b32 v3, s0, v6, v3
+; VI-NEXT: v_bfi_b32 v3, s2, v6, v3
+; VI-NEXT: v_bfi_b32 v1, s2, v1, v1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 26a1716db20271a..ba025a2202313fb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
index 7fb450a72ff746b..c6671fa51795439 100644
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -111,21 +111,19 @@ bb:
ret void
}
-; FIXME: produce v_alignbit_b32 v2, v2, s0, 24 instead of v_perm
define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GCN-LABEL: lsh8_or_lsr24:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_perm_b32 v2, s0, v2, v3
+; GCN-NEXT: v_alignbit_b32 v2, v2, s0, 24
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a8d53856c7c6161..5f896f92de0f424 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3404,263 +3404,3 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}
-
-define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_hilo:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dword v7, v[2:3], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3060505
-; GFX10-NEXT: global_store_dword v[4:5], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_hilo:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v7, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x3060505
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
- %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
- %v1e5 = extractelement <8 x i8> %vec1, i64 5
- %zv1e5 = zext i8 %v1e5 to i32
- %byte1 = shl i32 %zv1e5, 8
-
- %v1e6 = extractelement <8 x i8> %vec1, i64 6
- %zv1e6 = zext i8 %v1e6 to i32
- %byte2 = shl i32 %zv1e6, 16
- %v2e3 = extractelement <8 x i8> %vec2, i64 3
- %zv2e3 = zext i8 %v2e3 to i32
- %byte3 = shl i32 %zv2e3, 24
-
- %tmp0 = or i32 %zv1e5, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}
-
-define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_lohi:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off
-; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x70404
-; GFX10-NEXT: global_store_dword v[4:5], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_lohi:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off
-; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
-; GFX9-NEXT: s_mov_b32 s4, 0x70404
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
- %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
- %v1e0 = extractelement <8 x i8> %vec1, i64 0
- %zv1e0 = zext i8 %v1e0 to i32
- %byte1 = shl i32 %zv1e0, 8
-
- %v1e3 = extractelement <8 x i8> %vec1, i64 3
- %zv1e3 = zext i8 %v1e3 to i32
- %byte2 = shl i32 %zv1e3, 16
- %v2e4 = extractelement <8 x i8> %vec2, i64 4
- %zv2e4 = zext i8 %v2e4 to i32
- %byte3 = shl i32 %zv2e4, 24
-
- %tmp0 = or i32 %zv1e0, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}
-
-define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_hihi:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2070505
-; GFX10-NEXT: global_store_dword v[4:5], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_hihi:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4
-; GFX9-NEXT: s_mov_b32 s4, 0x2070505
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
- %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
- %v1e5 = extractelement <8 x i8> %vec1, i64 5
- %zv1e5 = zext i8 %v1e5 to i32
- %byte1 = shl i32 %zv1e5, 8
-
- %v1e7 = extractelement <8 x i8> %vec1, i64 7
- %zv1e7 = zext i8 %v1e7 to i32
- %byte2 = shl i32 %zv1e7, 16
- %v2e6 = extractelement <8 x i8> %vec2, i64 6
- %zv2e6 = zext i8 %v2e6 to i32
- %byte3 = shl i32 %zv2e6, 24
-
- %tmp0 = or i32 %zv1e5, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}
-
-define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_v8i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404
-; GFX10-NEXT: global_store_dword v[2:3], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_v8i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x1070404
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: global_store_dword v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
- %v1e4 = extractelement <8 x i8> %vec1, i64 4
- %zv1e4 = zext i8 %v1e4 to i32
- %byte1 = shl i32 %zv1e4, 8
-
- %v1e7 = extractelement <8 x i8> %vec1, i64 7
- %zv1e7 = zext i8 %v1e7 to i32
- %byte2 = shl i32 %zv1e7, 16
- %v2e1 = extractelement <8 x i8> %vec1, i64 1
- %zv2e1 = zext i8 %v2e1 to i32
- %byte3 = shl i32 %zv2e1, 24
-
- %tmp0 = or i32 %zv1e4, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}
-
-define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_v256i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:252
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6050707
-; GFX10-NEXT: global_store_dword v[2:3], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_v256i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:252
-; GFX9-NEXT: s_mov_b32 s4, 0x6050707
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: global_store_dword v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4
- %v1e4 = extractelement <256 x i8> %vec1, i64 255
- %zv1e4 = zext i8 %v1e4 to i32
- %byte1 = shl i32 %zv1e4, 8
-
- %v1e7 = extractelement <256 x i8> %vec1, i64 253
- %zv1e7 = zext i8 %v1e7 to i32
- %byte2 = shl i32 %zv1e7, 16
- %v2e1 = extractelement <256 x i8> %vec1, i64 254
- %zv2e1 = zext i8 %v2e1 to i32
- %byte3 = shl i32 %zv2e1, 24
-
- %tmp0 = or i32 %zv1e4, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}
-
-; TODO : support this pattern
-define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
-; GFX10-LABEL: extract_3src:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
-; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
-; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
-; GFX10-NEXT: global_store_dword v[4:5], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: extract_3src:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
-; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
- %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
- %v1e0 = extractelement <8 x i8> %vec1, i64 0
- %zv1e0 = zext i8 %v1e0 to i32
- %byte1 = shl i32 %zv1e0, 8
-
- %v1e5 = extractelement <8 x i8> %vec1, i64 5
- %zv1e5 = zext i8 %v1e5 to i32
- %byte2 = shl i32 %zv1e5, 16
- %v2e6 = extractelement <8 x i8> %vec2, i64 6
- %zv2e6 = zext i8 %v2e6 to i32
- %byte3 = shl i32 %zv2e6, 24
-
- %tmp0 = or i32 %zv1e0, %byte1
- %tmp1 = or i32 %tmp0, %byte2
- %res = or i32 %tmp1, %byte3
- store i32 %res, ptr addrspace(1) %out0, align 4
- ret void
-}