[llvm] [AMDGPU]: Accept constant zero bytes in v_perm OrCombine (PR #66533)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 15 10:35:58 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Changes:
This extends the OR combine to accept constant-zero bytes, allowing it to produce v_perm instructions whose masks contain the constant-zero byte selector 0x0c.
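
For context, a minimal host-side sketch of the byte selection that v_perm_b32 performs, covering only the selector values this combine emits (0-7 pick a source byte, 0x0c produces a constant 0x00 byte). The helper name `permB32Model` and the `main` driver are illustrative assumptions, not part of the patch:

```cpp
#include <cstdint>
#include <cstdio>

// Model of v_perm_b32 dst, src0, src1, mask for selectors 0-7 and 0x0c.
// Bytes 0-3 of the combined source come from Src1, bytes 4-7 from Src0.
uint32_t permB32Model(uint32_t Src0, uint32_t Src1, uint32_t Mask) {
  uint64_t Combined = ((uint64_t)Src0 << 32) | Src1;
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Sel = (Mask >> (I * 8)) & 0xff;
    // 0x0c is the constant-zero selector; 0-7 index a byte of Src0:Src1.
    uint8_t Byte = (Sel == 0x0c) ? 0 : (uint8_t)((Combined >> (Sel * 8)) & 0xff);
    Result |= (uint32_t)Byte << (I * 8);
  }
  return Result;
}

int main() {
  // Mask 0x0c0c0004 (as in the updated tests): result byte 0 takes byte 4
  // (low byte of Src0 = 0xDD), byte 1 takes byte 0 (low byte of Src1 = 0x44),
  // and the two high bytes are forced to zero. Prints 0x000044dd.
  printf("0x%08x\n", permB32Model(0xAABBCCDD, 0x11223344, 0x0c0c0004));
}
```
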
---
Patch is 47.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66533.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+56-7)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+120-122)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/load-hi16.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/load-lo16.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/load-local.128.ll (+25-24)
- (modified) llvm/test/CodeGen/AMDGPU/load-local.96.ll (+19-18)
- (modified) llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (+7-9)
- (modified) llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (+6-9)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c85ec3f9f5212f..18cd159780e9d1c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10700,6 +10700,23 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+ if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
+ SrcIndex = ScalarSize == 8 ? VecIdx : VecIdx * 2 + SrcIndex;
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ // The scalar is 32 bits, so just use the scalar
+ return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+ }
+
default: {
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
@@ -10958,6 +10975,9 @@ static bool addresses16Bits(int Mask) {
int Low8 = Mask & 0xff;
int Hi8 = (Mask & 0xff00) >> 8;
+ if (Low8 == 0x0c || Hi8 == 0x0c)
+ return false;
+
assert(Low8 < 8 && Hi8 < 8);
// Are the bytes contiguous in the order of increasing addresses.
bool IsConsecutive = (Hi8 - Low8 == 1);
@@ -11052,12 +11072,33 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// If all the uses of an or need to extract the individual elements, do not
// attempt to lower into v_perm
auto usesCombinedOperand = [](SDNode *OrUse) {
+ // The combined bytes seem to be getting extracted
+ if (OrUse->getOpcode() == ISD::SRL || OrUse->getOpcode() == ISD::TRUNCATE)
+ return false;
+
+ if (OrUse->getOpcode() == ISD::AND) {
+ auto SelectMask = dyn_cast<ConstantSDNode>(OrUse->getOperand(1));
+ if (SelectMask && (SelectMask->getZExtValue() == 0xFF))
+ return false;
+ }
+
+ if (OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE0 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE1 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE2 ||
+ OrUse->getOpcode() == AMDGPUISD::CVT_F32_UBYTE3) {
+ return false;
+ }
+
+ if (auto StoreUse = dyn_cast<StoreSDNode>(OrUse))
+ if (StoreUse->isTruncatingStore() &&
+ StoreUse->getMemoryVT().getSizeInBits() == 8)
+ return false;
+
// If we have any non-vectorized use, then it is a candidate for v_perm
- if (OrUse->getOpcode() != ISD::BITCAST ||
- !OrUse->getValueType(0).isVector())
+ if (!(OrUse->getValueType(0).isVector() &&
+ OrUse->getOpcode() != ISD::BUILD_VECTOR))
return true;
- // If we have any non-vectorized use, then it is a candidate for v_perm
for (auto VUse : OrUse->uses()) {
if (!VUse->getValueType(0).isVector())
return true;
@@ -11119,8 +11160,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
- // TODO support constantZero
- if (!P || P->isConstantZero())
+ if (!P)
return SDValue();
PermNodes.push_back(*P);
@@ -11128,11 +11168,17 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
if (PermNodes.size() != 4)
return SDValue();
- int FirstSrc = 0;
+ size_t FirstSrc = 0;
std::optional<int> SecondSrc;
uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];
+ if (PermOp.isConstantZero()) {
+ if (FirstSrc == i)
+ ++FirstSrc;
+ PermMask |= 0x0c << (i * 8);
+ continue;
+ }
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
// by sizeof(Src2) = 4
int SrcByteAdjust = 4;
@@ -11152,6 +11198,10 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}
+ SDLoc DL(N);
+ if (PermMask == 0x0c0c0c0c)
+ return DAG.getConstant(0, DL, MVT::i32);
+
SDValue Op = *PermNodes[FirstSrc].Src;
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
: *PermNodes[FirstSrc].Src;
@@ -11170,7 +11220,6 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
- SDLoc DL(N);
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 901cbd4a5272059..a02d11533a988f1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1428,7 +1428,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s8, 0x4000405
+; VI-NEXT: s_mov_b32 s8, 0xc0c0004
+; VI-NEXT: s_mov_b32 s9, 0x4000405
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
@@ -1438,35 +1439,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ubyte v6, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4
+; VI-NEXT: flat_load_ubyte v7, v[2:3]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[0:1]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
+; VI-NEXT: v_perm_b32 v3, v7, v6, s8
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
-; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_perm_b32 v0, v1, v0, s8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
+; VI-NEXT: v_perm_b32 v4, v3, v0, s9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v3
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_perm_b32 v4, v4, v5, s8
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1475,24 +1472,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2
-; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3
-; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3
+; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] offset:3
+; GFX10-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX10-NEXT: v_perm_b32 v5, v1, v2, 0xc0c0004
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4
+; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v5
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x4000405
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
-; GFX10-NEXT: global_store_dword v7, v4, s[2:3]
+; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dword v6, v4, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1505,16 +1502,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3
; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2
-; GFX9-NEXT: s_mov_b32 s4, 0x4000405
+; GFX9-NEXT: s_mov_b32 s4, 0xc0c0004
+; GFX9-NEXT: s_mov_b32 s5, 0x4000405
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX9-NEXT: v_perm_b32 v0, v1, v3, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
+; GFX9-NEXT: v_perm_b32 v1, v4, v2, s4
+; GFX9-NEXT: v_perm_b32 v4, v0, v1, s5
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4
; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dword v5, v4, s[2:3]
; GFX9-NEXT: s_endpgm
@@ -1527,19 +1525,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2
-; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3
-; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3
+; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] offset:3
+; GFX11-NEXT: global_load_u8 v3, v0, s[6:7] offset:3
; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; GFX11-NEXT: v_perm_b32 v4, v1, v2, 0xc0c0004
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v5, v0, v3, 0xc0c0004
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v4
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT: global_store_b32 v6, v4, s[2:3]
@@ -1794,43 +1793,46 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_mov_b32 s4, 0xc0c0004
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v10, v[2:3]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: flat_load_ubyte v7, v[8:9]
-; VI-NEXT: flat_load_ubyte v8, v[2:3]
-; VI-NEXT: flat_load_ubyte v2, v[0:1]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v9, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_perm_b32 v7, v8, v9, s4
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_perm_b32 v1, v6, v10, s4
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v3, v1
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1839,90 +1841,86 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
-; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
-; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
-; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4
-; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
-; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
-; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
+; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT: v_perm_b32 v8, v6, v5, 0xc0c0004
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v1
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8
+; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4
-; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2
-; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1
-; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3]
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v6, v0, s[2:3] offset:3
+; GFX9-NEXT: s_mov_b32 s2, 0xc0c0004
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
-; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
+; GFX9-NEXT: v_perm_b32 v8, v3, v6, s2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v1
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v1
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v8
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: load_v7i8_to_v7f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6
-; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3
-; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:4
+; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:6
+; GFX11-NEXT: global_load_u8 v3, v0, s[2:3] offset:3
+; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:2
; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1
-; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4
; GFX11-NEXT: ...
[truncated]
``````````
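
For readers skimming the truncated diff: the heart of the change is the PermMask loop in performOrCombine, which now maps a constant-zero ByteProvider to the 0x0c selector instead of bailing out. A minimal standalone sketch of that mapping follows; the helper name `buildPermMask` and the use of std::nullopt to stand in for ByteProvider::isConstantZero() are simplifications for illustration, not code from the patch:

```cpp
#include <array>
#include <cstdint>
#include <optional>

// Each entry is an already-adjusted byte selector for one result byte;
// std::nullopt marks a byte known to be constant zero.
uint32_t buildPermMask(const std::array<std::optional<uint8_t>, 4> &Bytes) {
  uint32_t PermMask = 0;
  for (size_t I = 0; I < Bytes.size(); ++I) {
    if (!Bytes[I]) {
      // Constant-zero byte: encode the 0x0c selector instead of a source byte.
      PermMask |= 0x0cu << (I * 8);
      continue;
    }
    PermMask |= static_cast<uint32_t>(*Bytes[I]) << (I * 8);
  }
  // The patch special-cases an all-zero mask (0x0c0c0c0c): the combine
  // returns the constant 0 rather than emitting a v_perm at all.
  return PermMask;
}
```
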
https://github.com/llvm/llvm-project/pull/66533
More information about the llvm-commits mailing list