[llvm] [AMDGPU]: Accept constant zero bytes in v_perm OrCombine (PR #66533)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 26 12:45:18 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/66533
>From 0b201d6ffcb003716cd7858011cf302c384a93e9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Sep 2023 10:18:05 -0700
Subject: [PATCH 1/2] [AMDGPU]: Accept constant zero bytes in v_perm OrCombine
Change-Id: I454ccee1e33867359ae8053464a2ca57a669d73f
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 103 ++++++++++++++++++---
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 80 ++++++++---------
llvm/test/CodeGen/AMDGPU/load-hi16.ll | 104 +++++++++++-----------
llvm/test/CodeGen/AMDGPU/load-lo16.ll | 60 ++++++-------
llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 18 ++--
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 60 ++++++-------
llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 18 ++--
7 files changed, 255 insertions(+), 188 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a64a9e608f2173..fc6386a0e13ecf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11612,6 +11612,25 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+
+ assert((ScalarSize >= 8) && !(ScalarSize % 8));
+
+ if (ScalarSize < 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
+
+ SrcIndex = VecIdx * ScalarSize / 8 + SrcIndex;
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+ // Just use the scalar
+ return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+ }
default: {
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
@@ -11640,6 +11659,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
bool IsVec = Op.getValueType().isVector();
+
switch (Op.getOpcode()) {
case ISD::OR: {
if (IsVec)
@@ -11922,6 +11942,9 @@ static bool addresses16Bits(int Mask) {
int Low8 = Mask & 0xff;
int Hi8 = (Mask & 0xff00) >> 8;
+ if (Low8 == 0x0c || Hi8 == 0x0c)
+ return false;
+
assert(Low8 < 8 && Hi8 < 8);
// Are the bytes contiguous in the order of increasing addresses.
bool IsConsecutive = (Hi8 - Low8 == 1);
@@ -11959,6 +11982,28 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}
+static bool bothAre8Bit(SDValue &Op, SDValue &OtherOp, bool IsNotLegalized) {
+ if (IsNotLegalized)
+ return Op.getValueSizeInBits() == 8 && OtherOp.getValueSizeInBits() == 8;
+
+ for (unsigned I = 1; I < Op.getValueSizeInBits().getFixedValue() / 8; I++) {
+ auto BP = calculateByteProvider(Op, I, 0, I);
+ if (BP && !BP->isConstantZero())
+ return false;
+ }
+
+ if (Op == OtherOp)
+ return true;
+
+ for (unsigned I = 1; I < OtherOp.getValueSizeInBits().getFixedValue() / 8;
+ I++) {
+ auto BP = calculateByteProvider(OtherOp, I, 0, I);
+ if (BP && !BP->isConstantZero())
+ return false;
+ }
+ return true;
+}
+
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
unsigned DWordOffset) {
SDValue Ret;
@@ -12016,17 +12061,16 @@ static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- [[maybe_unused]] EVT VT = N->getValueType(0);
SmallVector<ByteProvider<SDValue>, 8> PermNodes;
// VT is known to be MVT::i32, so we need to provide 4 bytes.
- assert(VT == MVT::i32);
+ assert(N->getValueType(0) == MVT::i32);
+
for (int i = 0; i < 4; i++) {
// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
- // TODO support constantZero
- if (!P || P->isConstantZero())
+ if (!P)
return SDValue();
PermNodes.push_back(*P);
@@ -12039,6 +12083,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
+ if (PermOp.isConstantZero()) {
+ if (FirstSrc.first == i)
+ ++FirstSrc.first;
+ PermMask |= 0x0c << (i * 8);
+ continue;
+ }
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
// by sizeof(Src2) = 4
int SrcByteAdjust = 4;
@@ -12062,10 +12112,14 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
}
SDLoc DL(N);
+ if (PermMask == 0x0c0c0c0c)
+ return DAG.getConstant(0, DL, MVT::i32);
+
SDValue Op = *PermNodes[FirstSrc.first].Src;
Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
assert(Op.getValueSizeInBits() == 32);
+ SDValue OtherOp;
// Check that we are not just extracting the bytes in order from an op
if (!SecondSrc) {
int Low16 = PermMask & 0xffff;
@@ -12077,17 +12131,21 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// The perm op would really just produce Op. So combine into Op
if (WellFormedLow && WellFormedHi)
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
- }
- SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
+ OtherOp = Op;
+ }
if (SecondSrc) {
- OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
+ OtherOp = getDWordFromOffset(DAG, DL, *PermNodes[SecondSrc->first].Src,
+ SecondSrc->second);
assert(OtherOp.getValueSizeInBits() == 32);
}
- if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
-
+ bool IsGFX9Plus =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>().getGeneration() >=
+ AMDGPUSubtarget::GFX9;
+ if (!IsGFX9Plus || (hasNon16BitAccesses(PermMask, Op, OtherOp) &&
+ (!bothAre8Bit(Op, OtherOp, DCI.isBeforeLegalize())))) {
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
@@ -12159,12 +12217,33 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// If all the uses of an or need to extract the individual elements, do not
// attempt to lower into v_perm
auto usesCombinedOperand = [](SDNode *OrUse) {
+ // The combined bytes seem to be getting extracted
+ if (OrUse->getOpcode() == ISD::SRL || OrUse->getOpcode() == ISD::TRUNCATE)
+ return false;
+
+ if (OrUse->getOpcode() == ISD::AND) {
+ auto SelectMask = dyn_cast<ConstantSDNode>(OrUse->getOperand(1));
+ if (SelectMask && (SelectMask->getZExtValue() == 0xFF))
+ return false;
+ }
+
+ if (OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE0 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE1 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE2 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE3) {
+ return false;
+ }
+
+ if (auto StoreUse = dyn_cast<StoreSDNode>(OrUse))
+ if (StoreUse->isTruncatingStore() &&
+ StoreUse->getMemoryVT().getSizeInBits() == 8)
+ return false;
+
// If we have any non-vectorized use, then it is a candidate for v_perm
- if (OrUse->getOpcode() != ISD::BITCAST ||
- !OrUse->getValueType(0).isVector())
+ if (!(OrUse->getValueType(0).isVector() &&
+ OrUse->getOpcode() != ISD::BUILD_VECTOR))
return true;
- // If we have any non-vectorized use, then it is a candidate for v_perm
for (auto VUse : OrUse->uses()) {
if (!VUse->getValueType(0).isVector())
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index e157c69dff3665..79aa1610e9ae09 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1428,7 +1428,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s8, 0x4000405
+; VI-NEXT: s_mov_b32 s8, 0xc0c0004
+; VI-NEXT: s_mov_b32 s9, 0x4000405
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
@@ -1438,35 +1439,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ubyte v6, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4
+; VI-NEXT: flat_load_ubyte v7, v[2:3]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
+; VI-NEXT: v_perm_b32 v3, v7, v6, s8
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_perm_b32 v0, v1, v0, s8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
+; VI-NEXT: v_perm_b32 v4, v3, v0, s9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v3
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_perm_b32 v4, v4, v5, s8
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1794,43 +1791,46 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_mov_b32 s4, 0xc0c0004
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v10, v[2:3]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: flat_load_ubyte v7, v[8:9]
-; VI-NEXT: flat_load_ubyte v8, v[2:3]
-; VI-NEXT: flat_load_ubyte v2, v[0:1]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v9, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_perm_b32 v7, v8, v9, s4
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_perm_b32 v1, v6, v10, s4
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v3, v1
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 0c61c58ef06192..37dd0da4685062 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -39,12 +39,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -106,12 +106,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias %
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -173,12 +173,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(ptr addrspace(3) noalias
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v3, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v0
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v3, v0, s4
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -491,9 +491,9 @@ define void @load_local_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %re
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -798,9 +798,9 @@ define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1102,9 +1102,9 @@ define void @load_flat_hi_v2i16_reglo_vreg_zexti8(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1496,9 +1496,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8)
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1699,8 +1699,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2196,9 +2196,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr add
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
+; GFX803-NEXT: s_mov_b32 s4, 0xc000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2255,9 +2255,9 @@ define <2 x i16> @load_local_v2i16_split_multi_chain(ptr addrspace(3) %in) #0 {
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:2
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain:
@@ -2305,10 +2305,9 @@ define <2 x i16> @load_local_lo_hi_v2i16_samechain(ptr addrspace(3) %in) #0 {
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain:
@@ -2408,9 +2407,9 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr a
; GFX803-NEXT: ds_read_u16 v2, v0
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect:
@@ -2466,8 +2465,8 @@ define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_v2i16_split:
@@ -2520,9 +2519,10 @@ define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
-; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_waitcnt vmcnt(0)
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: s_waitcnt lgkmcnt(0)
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_v2i16_split:
@@ -2572,9 +2572,9 @@ define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_v2i16_split:
@@ -2625,8 +2625,8 @@ define <2 x i16> @load_private_v2i16_split(ptr addrspace(5) byval(i16) %in) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_v2i16_split:
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 3ef86c13e150ac..299fbccf2bd568 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -56,9 +56,9 @@ define <2 x i16> @load_local_lo_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo:
@@ -105,9 +105,9 @@ define void @load_local_lo_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -314,9 +314,9 @@ define void @load_local_lo_v2i16_reghi_vreg_zexti8(ptr addrspace(3) %in, i32 %re
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -357,9 +357,9 @@ define void @load_local_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %re
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000c04
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -878,9 +878,9 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -964,9 +964,9 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1130,9 +1130,9 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1210,9 +1210,9 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1343,9 +1343,9 @@ define void @load_private_lo_v2i16_reghi_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1590,9 +1590,9 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8)
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1691,8 +1691,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1791,8 +1791,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1927,9 +1927,9 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2163,8 +2163,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2302,8 +2302,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; GFX803-NEXT: v_mov_b32_e32 v2, 44
; GFX803-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x3020c04
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 994ef22539a65f..01fa8895483b64 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -456,13 +456,12 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: s_mov_b32 s0, 0xc070c05
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT: v_perm_b32 v2, v3, v3, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -639,16 +638,15 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s2, 0xc010c05
+; VI-NEXT: s_mov_b32 s3, 0xc070c05
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_perm_b32 v0, v0, v4, s2
+; VI-NEXT: v_perm_b32 v1, v1, v1, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 8ac332197215f5..1fc3aac7d2d3f5 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3614,34 +3614,28 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
; GFX10-LABEL: extract_3src:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v8
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
-; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
-; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
-; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
+; GFX10-NEXT: v_perm_b32 v1, v6, v7, 0xc010404
+; GFX10-NEXT: v_and_or_b32 v0, 0xff000000, v0, v1
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_3src:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xc010404
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
-; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: v_perm_b32 v1, v6, v7, s4
+; GFX9-NEXT: s_mov_b32 s4, 0xff000000
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3762,34 +3756,34 @@ define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-LABEL: extract_v13i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:10
+; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:8
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_bfe_u32 v0, v2, 8, 8
+; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03
-; GFX10-NEXT: global_store_dword v[4:5], v0, off
-; GFX10-NEXT: global_store_dword v[6:7], v1, off
+; GFX10-NEXT: v_perm_b32 v1, v2, v2, 0xc050c04
+; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0xc000c07
+; GFX10-NEXT: global_store_dword v[4:5], v1, off
+; GFX10-NEXT: global_store_dword v[6:7], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v13i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:10
+; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:8
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
-; GFX9-NEXT: s_mov_b32 s4, 0x5040c00
-; GFX9-NEXT: s_mov_b32 s5, 0x5040c03
+; GFX9-NEXT: s_mov_b32 s4, 0xc050c04
+; GFX9-NEXT: s_mov_b32 s5, 0xc000c07
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfe_u32 v0, v2, 8, 8
+; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v9
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v3, s5
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-NEXT: global_store_dword v[6:7], v1, off
+; GFX9-NEXT: v_perm_b32 v1, v2, v2, s4
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s5
+; GFX9-NEXT: global_store_dword v[4:5], v1, off
+; GFX9-NEXT: global_store_dword v[6:7], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index b81af3eb838f1f..ee865811c63586 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -468,14 +468,12 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: s_mov_b32 s0, 0x60c040c
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_perm_b32 v2, v3, v3, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -652,18 +650,16 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s2, 0x20c000c
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
-; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
+; VI-NEXT: v_perm_b32 v1, v0, v1, s2
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0
-; VI-NEXT: v_or_b32_e32 v1, v1, v4
-; VI-NEXT: v_or_b32_e32 v0, v5, v0
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
>From 45aad68d20c1cecc391332b0bb3474b9227a0803 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 26 Feb 2024 12:41:57 -0800
Subject: [PATCH 2/2] fixup! Prefer v_or (v_lshl V0, N * 8) , V1 over v_perm
Change-Id: Ied4adc21dfaa0214c1bb0b8276819d516a2f034e
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 80 +++++++++++------------
llvm/test/CodeGen/AMDGPU/load-hi16.ll | 80 +++++++++++------------
llvm/test/CodeGen/AMDGPU/load-lo16.ll | 12 ++--
4 files changed, 88 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc6386a0e13ecf..e9c5bf31f60d4d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12141,11 +12141,8 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
assert(OtherOp.getValueSizeInBits() == 32);
}
- bool IsGFX9Plus =
- DAG.getMachineFunction().getSubtarget<GCNSubtarget>().getGeneration() >=
- AMDGPUSubtarget::GFX9;
- if (!IsGFX9Plus || (hasNon16BitAccesses(PermMask, Op, OtherOp) &&
- (!bothAre8Bit(Op, OtherOp, DCI.isBeforeLegalize())))) {
+ if (hasNon16BitAccesses(PermMask, Op, OtherOp) &&
+ (!bothAre8Bit(Op, OtherOp, DCI.isBeforeLegalize()))) {
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 79aa1610e9ae09..e157c69dff3665 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1428,8 +1428,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s8, 0xc0c0004
-; VI-NEXT: s_mov_b32 s9, 0x4000405
+; VI-NEXT: s_mov_b32 s8, 0x4000405
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
@@ -1439,31 +1438,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ubyte v6, v[0:1]
-; VI-NEXT: flat_load_ubyte v7, v[2:3]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[2:3]
+; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_perm_b32 v3, v7, v6, s8
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_perm_b32 v0, v1, v0, s8
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
-; VI-NEXT: v_perm_b32 v4, v3, v0, s9
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v3
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; VI-NEXT: v_or_b32_e32 v4, v5, v4
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: v_perm_b32 v4, v4, v5, s8
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1791,46 +1794,43 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT: s_mov_b32 s4, 0xc0c0004
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT: flat_load_ubyte v10, v[2:3]
+; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v7, v[8:9]
+; VI-NEXT: flat_load_ubyte v8, v[2:3]
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v9, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_perm_b32 v7, v8, v9, s4
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_perm_b32 v1, v6, v10, s4
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v3
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v3, v1
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 37dd0da4685062..73b520be173e16 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -39,12 +39,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -106,12 +106,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias %
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -173,12 +173,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(ptr addrspace(3) noalias
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v3, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v0
-; GFX803-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2255,9 +2255,9 @@ define <2 x i16> @load_local_v2i16_split_multi_chain(ptr addrspace(3) %in) #0 {
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:2
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain:
@@ -2305,9 +2305,10 @@ define <2 x i16> @load_local_lo_hi_v2i16_samechain(ptr addrspace(3) %in) #0 {
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: s_waitcnt lgkmcnt(1)
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain:
@@ -2407,9 +2408,9 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr a
; GFX803-NEXT: ds_read_u16 v2, v0
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT: v_or_b32_e32 v0, v2, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect:
@@ -2465,8 +2466,8 @@ define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_v2i16_split:
@@ -2519,10 +2520,9 @@ define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
-; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
-; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_v2i16_split:
@@ -2572,9 +2572,9 @@ define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_v2i16_split:
@@ -2625,8 +2625,8 @@ define <2 x i16> @load_private_v2i16_split(ptr addrspace(5) byval(i16) %in) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_v2i16_split:
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
+; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 299fbccf2bd568..0be465737fdf8b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -56,9 +56,9 @@ define <2 x i16> @load_local_lo_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo:
@@ -105,9 +105,9 @@ define void @load_local_lo_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1343,9 +1343,9 @@ define void @load_private_lo_v2i16_reghi_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
-; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits mailing list