[llvm] d0e54e3 - [AMDGPU] Extend CalculateByteProvider to capture vectors and signed
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 11 08:58:58 PDT 2023
Author: Jeffrey Byrnes
Date: 2023-08-11T08:47:17-07:00
New Revision: d0e54e377b5771f3cafc8a4772710f25dc0ac437
URL: https://github.com/llvm/llvm-project/commit/d0e54e377b5771f3cafc8a4772710f25dc0ac437
DIFF: https://github.com/llvm/llvm-project/commit/d0e54e377b5771f3cafc8a4772710f25dc0ac437.diff
LOG: [AMDGPU] Extend CalculateByteProvider to capture vectors and signed
Differential Revision: https://reviews.llvm.org/D157133
Change-Id: I9ba8727b4ac5a627de2f7d87d2169eb79e01f0ee
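The new permute_i8.ll tests below show the shape of IR this change targets: a 32-bit value assembled with shl/or chains whose byte sources are vector element extracts, possibly behind sign or zero extends. A condensed, hypothetical sketch of such a pattern (the function and value names are illustrative, not taken from the commit):

; Pack byte 0 and byte 3 of one <4 x i8> load plus byte 3 of another
; into a single i32. With this patch, calculateByteProvider can walk
; through the extractelement/zext nodes to the underlying 32-bit vector
; loads, which is the prerequisite for folding the or chain into a
; single v_perm_b32.
define i32 @pack_bytes(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
  %v0 = load <4 x i8>, ptr addrspace(1) %p0, align 4
  %v1 = load <4 x i8>, ptr addrspace(1) %p1, align 4
  %a = extractelement <4 x i8> %v0, i64 0
  %b = extractelement <4 x i8> %v0, i64 3
  %c = extractelement <4 x i8> %v1, i64 3
  %za = zext i8 %a to i32
  %zb = zext i8 %b to i32
  %zc = zext i8 %c to i32
  %byte1 = shl i32 %za, 8
  %byte2 = shl i32 %zb, 16
  %byte3 = shl i32 %zc, 24
  %t0 = or i32 %za, %byte1
  %t1 = or i32 %t0, %byte2
  %r = or i32 %t1, %byte3
  ret i32 %r
}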
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
llvm/test/CodeGen/AMDGPU/permute_i8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7d1f3d5f50e65..0008cedb832211 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10458,6 +10458,25 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG: {
+ SDValue NarrowOp = Op->getOperand(0);
+ auto NarrowVT = NarrowOp.getValueType();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowVT = VTSign->getVT();
+ }
+ if (!NarrowVT.isByteSized())
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
+
+ if (SrcIndex >= NarrowByteWidth)
+ return std::nullopt;
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10497,7 +10516,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
unsigned BitWidth = Op.getScalarValueSizeInBits();
if (BitWidth % 8 != 0)
return std::nullopt;
- assert(Index < BitWidth / 8 && "invalid index requested");
+ if (Index > BitWidth / 8 - 1)
+ return std::nullopt;
switch (Op.getOpcode()) {
case ISD::OR: {
@@ -10540,6 +10560,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
}
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10586,9 +10607,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
- case ISD::ZERO_EXTEND: {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::AssertZext:
+ case ISD::AssertSext: {
SDValue NarrowOp = Op->getOperand(0);
- unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ Op->getOpcode() == ISD::AssertZext ||
+ Op->getOpcode() == ISD::AssertSext) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowBitWidth = VTSign->getVT().getSizeInBits();
+ }
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
@@ -10602,10 +10632,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::TRUNCATE: {
- unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
- if (NarrowBitWidth % 8 != 0)
- return std::nullopt;
- uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+ uint64_t NarrowByteWidth = BitWidth / 8;
if (NarrowByteWidth >= Index) {
return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
@@ -10616,10 +10643,6 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::CopyFromReg: {
- auto BitWidth = Op.getScalarValueSizeInBits();
- if (BitWidth % 8)
- llvm_unreachable("Invalid type in CopyFromReg");
-
if (BitWidth / 8 > Index)
return calculateSrcByte(Op, StartingIndex, Index);
@@ -10653,6 +10676,23 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
case ISD::BSWAP:
return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
Depth + 1, StartingIndex);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+ if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
+ Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
+ }
+
+ return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
+ StartingIndex, Index);
+ }
+
default: {
return std::nullopt;
}
@@ -10854,9 +10894,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
// TODO support constantZero
- if (!P || P->isConstantZero()) {
+ if (!P || P->isConstantZero())
return SDValue();
- }
PermNodes.push_back(*P);
}
@@ -10892,7 +10931,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
: *PermNodes[FirstSrc].Src;
// Check that we are not just extracting the bytes in order from an op
- if (Op == OtherOp) {
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
@@ -10901,13 +10940,19 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// The perm op would really just produce Op. So combine into Op
if (WellFormedLow && WellFormedHi)
- return Op;
+ return DAG.getBitcast(MVT::getIntegerVT(32), Op);
}
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
SDLoc DL(N);
assert(Op.getValueType().isByteSized() &&
OtherOp.getValueType().isByteSized());
+
+ // Handle potential vectors
+ Op = DAG.getBitcast(MVT::getIntegerVT(Op.getValueSizeInBits()), Op);
+ OtherOp = DAG.getBitcast(
+ MVT::getIntegerVT(OtherOp.getValueSizeInBits()), OtherOp);
+
if (Op.getValueSizeInBits() < 32)
// If the ultimate src is less than 32 bits, then we will only be
// using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
@@ -10917,7 +10962,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);
if (OtherOp.getValueSizeInBits() < 32)
- OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
+ OtherOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, OtherOp);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(PermMask, DL, MVT::i32));
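A note on the new ISD::EXTRACT_VECTOR_ELT case above: a requested byte of a sub-dword element is remapped to a byte of the 32-bit vector source, using the constant vector index directly for 8-bit elements and VecIdx * 2 + Index for 16-bit ones; sub-dword extracts that land beyond the low 32 bits of the vector return std::nullopt. A hypothetical IR fragment (made up for illustration) annotated with that mapping:

define i16 @extract_high_half(ptr addrspace(1) %p) {
  ; Element 0 occupies bytes 0-1 of the 32-bit register, element 1 bytes 2-3.
  %vec = load <2 x i16>, ptr addrspace(1) %p, align 4
  ; Byte 0 of %e1 maps to byte 2 of %vec (VecIdx * 2 + Index = 1 * 2 + 0),
  ; byte 1 of %e1 maps to byte 3 of %vec (1 * 2 + 1).
  %e1 = extractelement <2 x i16> %vec, i64 1
  ret i16 %e1
}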
diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
index 8b5753cc4e2ccc..83b650e2d755e2 100644
--- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -544,11 +544,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-UNPACKED-NEXT: v_perm_b32 v0, v1, v2, s0
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4
@@ -644,12 +644,11 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1
; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
+; GFX8-UNPACKED-NEXT: v_perm_b32 v3, v3, v4, s0
+; GFX8-UNPACKED-NEXT: v_perm_b32 v2, v1, v2, s0
+; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 878cf49a40c1bc..4fa5b6cf843c1f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1692,18 +1692,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_perm_b32 v0, v0, s4, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1850,18 +1849,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_perm_b32 v1, v1, s4, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 0fe97188bbf6a9..7167dd93be8025 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -232,9 +232,9 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
+; TONGA-NEXT: s_mov_b32 s0, 0x1000504
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0
; TONGA-NEXT: v_mov_b32_e32 v1, v2
; TONGA-NEXT: ; return to shader part epilog
;
@@ -282,9 +282,9 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc,
; TONGA-NEXT: v_mov_b32_e32 v6, v3
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
+; TONGA-NEXT: s_mov_b32 s0, 0x1000504
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0
; TONGA-NEXT: v_mov_b32_e32 v1, v5
; TONGA-NEXT: v_mov_b32_e32 v2, v6
; TONGA-NEXT: ; return to shader part epilog
@@ -368,11 +368,10 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16
+; TONGA-NEXT: s_mov_b32 s0, 0x1000504
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; TONGA-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0
+; TONGA-NEXT: v_perm_b32 v1, v2, v3, s0
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_b_2d_v4f16:
@@ -419,11 +418,10 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc,
; TONGA-NEXT: v_mov_b32_e32 v7, v3
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
+; TONGA-NEXT: s_mov_b32 s0, 0x1000504
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; TONGA-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0
+; TONGA-NEXT: v_perm_b32 v1, v5, v6, s0
; TONGA-NEXT: v_mov_b32_e32 v2, v7
; TONGA-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 234161a36ee037..2d8a64e6bcbc80 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -2751,3 +2751,218 @@ entry:
store i32 %result, ptr addrspace(1) undef
ret void
}
+
+define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract3744:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract3744:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3070404
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <4 x i8> %vec1, i64 0
+ %zv1e0 = zext i8 %v1e0 to i32
+ %byte1 = shl i32 %zv1e0, 8
+
+ %v1e3 = extractelement <4 x i8> %vec1, i64 3
+ %zv1e3 = zext i8 %v1e3 to i32
+ %byte2 = shl i32 %zv1e3, 16
+ %v2e3 = extractelement <4 x i8> %vec2, i64 3
+ %zv2e3 = zext i8 %v2e3 to i32
+ %byte3 = shl i32 %zv2e3, 24
+
+ %tmp0 = or i32 %zv1e0, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: extract1347_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract1347_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: global_load_dword v7, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1030407
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <2 x i16> %vec1, i64 0
+ %v1e1 = extractelement <2 x i16> %vec1, i64 1
+ %v2e0 = extractelement <2 x i16> %vec2, i64 0
+ %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+ %b0t0 = and i16 -256, %v2e1
+ %b0t1 = lshr i16 %b0t0, 8
+ %byte0 = zext i16 %b0t1 to i32
+
+ %b1t0 = and i16 255, %v2e0
+ %b1t1 = zext i16 %b1t0 to i32
+ %byte1 = shl i32 %b1t1, 8
+
+ %b2t0 = and i16 -256, %v1e1
+ %b2t1 = lshr i16 %b2t0, 8
+ %b2t2 = zext i16 %b2t1 to i32
+ %byte2 = shl i32 %b2t2, 16
+
+ %b3t0 = and i16 -256, %v1e0
+ %b3t1 = lshr i16 %b3t0, 8
+ %b3t2 = zext i16 %b3t1 to i32
+ %byte3 = shl i32 %b3t2, 24
+
+ %tmp0 = or i32 %byte0, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
+; GFX10-LABEL: shlbase:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v7, v[0:1], off
+; GFX10-NEXT: global_load_dword v8, v[2:3], off
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v6
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, v3, v2
+; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shlbase:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v7, v[0:1], off
+; GFX9-NEXT: global_load_dword v8, v[2:3], off
+; GFX9-NEXT: v_add_u32_e32 v0, 8, v6
+; GFX9-NEXT: v_add_u32_e32 v1, 16, v6
+; GFX9-NEXT: v_add_u32_e32 v2, 24, v6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshl_or_b32 v0, v3, v0, v3
+; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
+ %v1e0 = extractelement <4 x i8> %vec1, i64 0
+ %zv1e0 = zext i8 %v1e0 to i32
+ %b8 = add i32 %base, 8
+ %byte1 = shl i32 %zv1e0, %b8
+
+ %v1e3 = extractelement <4 x i8> %vec1, i64 3
+ %zv1e3 = zext i8 %v1e3 to i32
+ %b16 = add i32 %base, 16
+ %byte2 = shl i32 %zv1e3, %b16
+ %v2e3 = extractelement <4 x i8> %vec2, i64 3
+ %zv2e3 = zext i8 %v2e3 to i32
+ %b24 = add i32 %base, 24
+ %byte3 = shl i32 %zv2e3, %b24
+
+ %tmp0 = or i32 %zv1e0, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}
+
+; TODO -- lower into v_perm
+define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) {
+; GFX10-LABEL: extractbase:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v7, v[0:1], off
+; GFX10-NEXT: global_load_dword v8, v[2:3], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_bfe_u32 v2, v7, v1, 8
+; GFX10-NEXT: v_bfe_u32 v0, v7, v0, 8
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v0
+; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: global_store_dword v[4:5], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extractbase:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v7, v[0:1], off
+; GFX9-NEXT: global_load_dword v8, v[2:3], off
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v1, 24, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_bfe_u32 v0, v7, v0, 8
+; GFX9-NEXT: v_bfe_u32 v2, v7, v1, 8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
+; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
+ %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
+ %v1b = extractelement <4 x i8> %vec1, i64 %base
+ %zv1b = zext i8 %v1b to i32
+ %byte1 = shl i32 %zv1b, 8
+
+ %b3 = add i64 %base, 3
+ %v1b3 = extractelement <4 x i8> %vec1, i64 %b3
+ %zv1b3 = zext i8 %v1b3 to i32
+ %byte2 = shl i32 %zv1b3, 16
+ %v2b3 = extractelement <4 x i8> %vec2, i64 %b3
+ %zv2b3 = zext i8 %v2b3 to i32
+ %byte3 = shl i32 %zv2b3, 24
+
+ %tmp0 = or i32 %zv1b, %byte1
+ %tmp1 = or i32 %tmp0, %byte2
+ %res = or i32 %tmp1, %byte3
+ store i32 %res, ptr addrspace(1) %out0, align 4
+ ret void
+}