[llvm] 9c92864 - AMDGPU: Fix interaction of tfe and d16
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 22 06:26:23 PST 2020
Author: Matt Arsenault
Date: 2020-01-22T09:26:17-05:00
New Revision: 9c928649a085646c4c779bac095643b50b464d83
URL: https://github.com/llvm/llvm-project/commit/9c928649a085646c4c779bac095643b50b464d83
DIFF: https://github.com/llvm/llvm-project/commit/9c928649a085646c4c779bac095643b50b464d83.diff
LOG: AMDGPU: Fix interaction of tfe and d16
This was using the wrong result register and dropping the result entirely
for v2f16. The scalar case would fail to select. I believe it was also
mishandling packed/unpacked subtargets.
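
For context, the affected pattern is a d16 image load with the TFE bit set in
texfailctrl, which makes the intrinsic return an aggregate of the half payload
plus an i32 status word. A minimal reproducer along the lines of the new test
(the dmask value and calling convention here are just illustrative) is:

declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg)

define amdgpu_ps void @repro(<8 x i32> inreg %rsrc, i32 %s) {
  ; texfailctrl = 1 enables tfe, so the result carries both the v2f16 data
  ; and the i32 error word in the dword after the data.
  %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
  %data = extractvalue { <2 x half>, i32 } %v, 0
  %err = extractvalue { <2 x half>, i32 } %v, 1
  store volatile <2 x half> %data, <2 x half> addrspace(1)* undef
  store volatile i32 %err, i32 addrspace(1)* undef
  ret void
}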
Added:
llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 24f099d711ef..4c68397e5bad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5215,6 +5215,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
+static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
+ SDValue Src, int ExtraElts) {
+ EVT SrcVT = Src.getValueType();
+
+ SmallVector<SDValue, 8> Elts;
+
+ if (SrcVT.isVector())
+ DAG.ExtractVectorElements(Src, Elts);
+ else
+ Elts.push_back(Src);
+
+ SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
+ while (ExtraElts--)
+ Elts.push_back(Undef);
+
+ return DAG.getBuildVector(CastVT, DL, Elts);
+}
+
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means the
// required return type is an aggregate
@@ -5226,76 +5244,56 @@ static SDValue constructRetValue(SelectionDAG &DAG,
const SDLoc &DL, LLVMContext &Context) {
// Determine the required return type. This is the same regardless of the IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
- EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
- EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
- : AdjEltVT
- : ReqRetVT;
-
- // Extract data part of the result
- // Bitcast the result to the same type as the required return type
- int NumElts;
- if (IsD16 && !Unpacked)
- NumElts = NumVDataDwords << 1;
- else
- NumElts = NumVDataDwords;
+ int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ ReqRetNumElts : (ReqRetNumElts + 1) / 2;
- EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
- : AdjEltVT;
+ int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ DMaskPop : (DMaskPop + 1) / 2;
- // Special case for v6f16. Rather than add support for this, use v3i32 to
- // extract the data elements
- bool V6F16Special = false;
- if (NumElts == 6) {
- CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
- DMaskPop >>= 1;
- ReqRetNumElts >>= 1;
- V6F16Special = true;
- AdjVT = MVT::v2i32;
- }
+ MVT DataDwordVT = NumDataDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
- SDValue N = SDValue(Result, 0);
- SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+ MVT MaskPopVT = MaskPopDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
- // Iterate over the result
- SmallVector<SDValue, 4> BVElts;
+ SDValue Data(Result, 0);
+ SDValue TexFail;
- if (CastVT.isVector()) {
- DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
- } else {
- BVElts.push_back(CastRes);
- }
- int ExtraElts = ReqRetNumElts - DMaskPop;
- while(ExtraElts--)
- BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+ if (IsTexFail) {
+ SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
+ if (MaskPopVT.isVector()) {
+ Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ } else {
+ Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ }
- SDValue PreTFCRes;
- if (ReqRetNumElts > 1) {
- SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
- if (IsD16 && Unpacked)
- PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
- else
- PreTFCRes = NewVec;
- } else {
- PreTFCRes = BVElts[0];
+ TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ SDValue(Result, 0),
+ DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}
- if (V6F16Special)
- PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+ if (DataDwordVT.isVector())
+ Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
+ NumDataDwords - MaskPopDwords);
- if (!IsTexFail) {
- if (Result->getNumValues() > 1)
- return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
- else
- return PreTFCRes;
- }
+ if (IsD16)
+ Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
+
+ if (!ReqRetVT.isVector())
+ Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
+
+ Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
+
+ if (TexFail)
+ return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+
+ if (Result->getNumValues() == 1)
+ return Data;
- // Extract the TexFail result and insert into aggregate return
- SmallVector<SDValue, 1> TFCElt;
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
- SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
- return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+ return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
@@ -5545,8 +5543,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
EVT NewVT = NumVDataDwords > 1 ?
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
- : MVT::f32;
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
+ : MVT::i32;
ResultTypes[0] = NewVT;
if (ResultTypes.size() == 3) {
diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
new file mode 100644
index 000000000000..30e4a1e645b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -0,0 +1,410 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
+
+define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_f16_tfe_dmask0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_f16_tfe_dmask0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { half, i32 } %v, 0
+ %v.err = extractvalue { half, i32 } %v, 1
+ store volatile half %v.data, half addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_f16_tfe_dmask1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_f16_tfe_dmask1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { half, i32 } %v, 0
+ %v.err = extractvalue { half, i32 } %v, 1
+ store volatile half %v.data, half addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_v2f16_tfe_dmask0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { <2 x half>, i32 } %v, 0
+ %v.err = extractvalue { <2 x half>, i32 } %v, 1
+ store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_v2f16_tfe_dmask1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { <2 x half>, i32 } %v, 0
+ %v.err = extractvalue { <2 x half>, i32 } %v, 1
+ store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_v2f16_tfe_dmask3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { <2 x half>, i32 } %v, 0
+ %v.err = extractvalue { <2 x half>, i32 } %v, 1
+ store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
+; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; %v.data = extractvalue { <3 x half>, i32 } %v, 0
+; %v.err = extractvalue { <3 x half>, i32 } %v, 1
+; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
+; store volatile i32 %v.err, i32 addrspace(1)* undef
+; ret void
+; }
+
+define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
+; GFX10-NEXT: global_store_dword v[0:1], v3, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { <4 x half>, i32 } %v, 0
+ %v.err = extractvalue { <4 x half>, i32 } %v, 1
+ store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
+
+declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+attributes #0 = { nounwind readonly }