[llvm] 391249d - [AMDGPU] Allow 8,16 bit sources in calculateSrcByte
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 28 09:51:06 PDT 2023
Author: Jeffrey Byrnes
Date: 2023-07-28T09:50:21-07:00
New Revision: 391249d1afe47d1671486a267eaf821a694987ea
URL: https://github.com/llvm/llvm-project/commit/391249d1afe47d1671486a267eaf821a694987ea
DIFF: https://github.com/llvm/llvm-project/commit/391249d1afe47d1671486a267eaf821a694987ea.diff
LOG: [AMDGPU] Allow 8,16 bit sources in calculateSrcByte
This is required for many trees produced in practice for i8 CodeGen.
Differential Revision: https://reviews.llvm.org/D155864
Change-Id: Iac01d183d9998b15138bdc7a5051e3bed338e7d9
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/bf16.ll
llvm/test/CodeGen/AMDGPU/load-hi16.ll
llvm/test/CodeGen/AMDGPU/load-lo16.ll
llvm/test/CodeGen/AMDGPU/permute_i8.ll
llvm/test/CodeGen/AMDGPU/trunc-combine.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d7bf5561a7c384..5509b408eb495c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10428,10 +10428,12 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;
+ auto ValueSize = Op.getValueSizeInBits();
+ if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+ return std::nullopt;
+
switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
- if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
- return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
@@ -10451,9 +10453,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
}
default: {
- if (Op.getScalarValueSizeInBits() != 32)
- return std::nullopt;
-
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
}
@@ -10595,6 +10594,17 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
}
+ case ISD::CopyFromReg: {
+ auto BitWidth = Op.getScalarValueSizeInBits();
+ if (BitWidth % 8)
+ llvm_unreachable("Invalid type in CopyFromReg");
+
+ if (BitWidth / 8 > Index)
+ return calculateSrcByte(Op, StartingIndex, Index);
+
+ return std::nullopt;
+ }
+
case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -10631,7 +10641,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
// Returns true if the Operand is a scalar and is 16 bits
-static bool is16BitScalarOp(SDValue &Operand) {
+static bool isExtendedFrom16Bits(SDValue &Operand) {
+
switch (Operand.getOpcode()) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
@@ -10647,7 +10658,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
auto MemVT = L->getMemoryVT();
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
}
- return false;
+ return L->getMemoryVT().getSizeInBits() == 16;
}
default:
return false;
@@ -10675,29 +10686,29 @@ static bool addresses16Bits(int Mask) {
// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
-static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
- // ByteProvider only accepts 32 bit operands
- assert(Op.getValueType().getSizeInBits() == 32);
- assert(OtherOp.getValueType().getSizeInBits() == 32);
+ assert(Op.getValueType().isByteSized());
+ assert(OtherOp.getValueType().isByteSized());
- auto OpIs16Bit = is16BitScalarOp(Op);
- auto OtherOpIs16Bit = is16BitScalarOp(Op);
+ auto TempOp = peekThroughBitcasts(Op);
+ auto TempOtherOp = peekThroughBitcasts(OtherOp);
- // If there is a size mismatch, then we must use masking on at least one
- // operand
- if (OpIs16Bit != OtherOpIs16Bit)
+ auto OpIs16Bit =
+ TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
+ if (!OpIs16Bit)
return true;
- // If both operands are 16 bit, return whether or not we cleanly address both
- if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
- return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+ auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
+ isExtendedFrom16Bits(TempOtherOp);
+ if (!OtherOpIs16Bit)
+ return true;
- // Both are 32 bit operands
- return true;
+ // Do we cleanly address both
+ return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
}
SDValue SITargetLowering::performOrCombine(SDNode *N,
@@ -10822,8 +10833,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
// TODO support constantZero
- if (!P || P->isConstantZero())
+ if (!P || P->isConstantZero()) {
return SDValue();
+ }
PermNodes.push_back(*P);
}
@@ -10832,7 +10844,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
int FirstSrc = 0;
std::optional<int> SecondSrc;
- uint64_t permMask = 0x00000000;
+ uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
@@ -10843,15 +10855,15 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
if (SecondSrc.has_value())
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
return SDValue();
+
// Set the index of the second distinct Src node
SecondSrc = i;
- assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
- 32);
+ assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
SrcByteAdjust = 0;
}
assert(PermOp.SrcOffset + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());
- permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}
SDValue Op = *PermNodes[FirstSrc].Src;
@@ -10860,8 +10872,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// Check that we are not just extracting the bytes in order from an op
if (Op == OtherOp) {
- int Low16 = permMask & 0xffff;
- int Hi16 = (permMask & 0xffff0000) >> 16;
+ int Low16 = PermMask & 0xffff;
+ int Hi16 = (PermMask & 0xffff0000) >> 16;
bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
@@ -10871,10 +10883,23 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return Op;
}
- if (hasEightBitAccesses(permMask, Op, OtherOp)) {
+ if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
SDLoc DL(N);
+ assert(Op.getValueType().isByteSized() &&
+ OtherOp.getValueType().isByteSized());
+ if (Op.getValueSizeInBits() < 32)
+ // If the ultimate src is less than 32 bits, then we will only be
+ // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
+ // CalculateByteProvider would not have returned Op as source if we
+ // used a byte that is outside its ValueType. Thus, we are free to
+ // ANY_EXTEND as the extended bits are dont-cares.
+ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);
+
+ if (OtherOp.getValueSizeInBits() < 32)
+ OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
+
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
- DAG.getConstant(permMask, DL, MVT::i32));
+ DAG.getConstant(PermMask, DL, MVT::i32));
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 700325859151ed..c354f783f57662 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1263,18 +1263,13 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
; GFX9-LABEL: test_ret_v3bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v3bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
ret <3 x bfloat> %in
@@ -1802,9 +1797,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -1841,11 +1833,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 52f6ca52d6b23b..aa034cc185b474 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -266,9 +266,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
@@ -311,9 +311,9 @@ define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -696,9 +696,9 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1006,9 +1006,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1300,9 +1300,9 @@ define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1399,8 +1399,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -1851,9 +1851,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2069,9 +2069,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -2678,10 +2678,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3)
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x1000504
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 3e8e3dec7f44c7..0c8209baf09cde 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -621,10 +621,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_mov_b32_e32 v2, 0
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX803-NEXT: s_mov_b32 s4, 0x3020504
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -734,12 +734,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: s_mov_b32 s4, 0x3020504
; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v4
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 378907d20738ff..234161a36ee037 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -2717,3 +2717,37 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
ret void
}
+
+define void @Source16Bit(i16 %in, <2 x i16> %reg) {
+; GFX10-LABEL: Source16Bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: Source16Bit:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3050204
+; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %elt0 = extractelement <2 x i16> %reg, i32 1
+ %e0b0 = and i16 %elt0, 255
+ %e0b1 = and i16 %elt0, -256
+ %e1b0 = and i16 %in, 255
+ %e1b1 = and i16 %in, -256
+ %tmp0 = shl i16 %e0b0, 8
+ %byte0 = or i16 %tmp0, %e1b0
+ %tmp2 = lshr i16 %e1b1, 8
+ %byte1 = or i16 %e0b1, %tmp2
+ %ext0 = zext i16 %byte0 to i32
+ %ext1 = zext i16 %byte1 to i32
+ %shifted = shl i32 %ext1, 16
+ %result = or i32 %shifted, %ext0
+ store i32 %result, ptr addrspace(1) undef
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index afec8f35126504..d200b25c17d33b 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -150,8 +150,8 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
; VI-LABEL: trunc_v2i64_arg_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b32 s4, 0x1000504
+; VI-NEXT: v_perm_b32 v0, v0, v2, s4
; VI-NEXT: s_setpc_b64 s[30:31]
%trunc = trunc <2 x i64> %arg0 to <2 x i16>
ret <2 x i16> %trunc
More information about the llvm-commits
mailing list