[llvm] 72ccec1 - [AMDGPU] Fix v3f16 interaction with image store workaround
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 18 09:21:15 PST 2020
Author: Sebastian Neubauer
Date: 2020-11-18T18:21:04+01:00
New Revision: 72ccec1bbc98407d66275c5911ee557adb461f5a
URL: https://github.com/llvm/llvm-project/commit/72ccec1bbc98407d66275c5911ee557adb461f5a
DIFF: https://github.com/llvm/llvm-project/commit/72ccec1bbc98407d66275c5911ee557adb461f5a.diff
LOG: [AMDGPU] Fix v3f16 interaction with image store workaround
In some cases, the wrong number of registers was reserved.
Also enable more v3f16 tests.
Differential Revision: https://reviews.llvm.org/D90847
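
For readers skimming the diff: the GlobalISel change below builds a <6 x s16> vector (the three data halves plus three undef halves) and bitcasts it to <3 x s32>, where the old code widened all the way to <8 x s16>/<4 x s32> and so reserved one VGPR too many. The following is a minimal standalone C++ sketch of that packing rule, not LLVM code; packD16 and the undef placeholder are illustrative names only.

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone model of the d16 payload packing: 16-bit elements are padded
// with "undef" lanes and packed two per 32-bit register. The fix pads a
// three-element payload to six halves (three dwords) instead of eight
// halves (four dwords), so one fewer register is reserved for the store.
static std::vector<uint32_t> packD16(std::vector<uint16_t> halves,
                                     size_t paddedHalfCount) {
  const uint16_t kUndef = 0; // placeholder for an undef lane
  halves.resize(paddedHalfCount, kUndef);
  std::vector<uint32_t> dwords;
  for (size_t i = 0; i < halves.size(); i += 2)
    dwords.push_back(uint32_t(halves[i]) | (uint32_t(halves[i + 1]) << 16));
  return dwords;
}

int main() {
  std::vector<uint16_t> v3 = {0x3C00, 0x4000, 0x4200}; // 1.0, 2.0, 3.0 as f16 bits
  std::printf("old padding: %zu dwords, new padding: %zu dwords\n",
              packD16(v3, 8).size(), packD16(v3, 6).size());
  return 0;
}

Running the sketch prints "old padding: 4 dwords, new padding: 3 dwords", which is the register-count difference the commit message refers to.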
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8cadf725fc92..11476ee39764 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3560,9 +3560,9 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
auto Unmerge = B.buildUnmerge(S16, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
- PackedRegs.resize(8, B.buildUndef(S16).getReg(0));
- Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0);
- return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0);
+ PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
+ Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
+ return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
}
if (StoreVT.getNumElements() == 4) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 594ffc7ea22a..9f81d534e5fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7455,17 +7455,6 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
return DAG.UnrollVectorOp(ZExt.getNode());
- } else if (NumElements == 3) {
- EVT IntStoreVT =
- EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
- SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
-
- EVT WidenedStoreVT = EVT::getVectorVT(
- *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
- EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
- WidenedStoreVT.getStoreSizeInBits());
- SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
- return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
}
// The sq block of gfx8.1 does not estimate register use correctly for d16
@@ -7488,9 +7477,17 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
PackedElts.push_back(IntPair);
}
+ if ((NumElements % 2) == 1) {
+ // Handle v3i16
+ unsigned I = Elts.size() / 2;
+ SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
+ {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
+ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+ PackedElts.push_back(IntPair);
+ }
// Pad using UNDEF
- PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));
+ PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
// Build final vector
EVT VecVT =
@@ -7498,6 +7495,19 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
return DAG.getBuildVector(VecVT, DL, PackedElts);
}
+ if (NumElements == 3) {
+ EVT IntStoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT WidenedStoreVT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
+ EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
+ WidenedStoreVT.getStoreSizeInBits());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
+ return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
+ }
+
assert(isTypeLegal(StoreVT));
return VData;
}
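
In the SIISelLowering.cpp hunk above, the gfx8.1 sq workaround now handles an odd trailing element by pairing it with an undef half, and then pads the packed list with undef dwords up to the original element count rather than to twice the number of pairs. A small hedged model of the dword count that path produces (plain C++ for illustration, not LLVM API):

#include <cstdio>

// Rough model of the workaround path: 16-bit elements are paired into
// 32-bit values (an odd trailing element is paired with undef), and the
// list is then padded with undef dwords up to the original element count,
// which is the register count the gfx8.1 sq block expects.
static size_t workaroundDwords(size_t numElements) {
  size_t packed = numElements / 2; // full pairs
  if (numElements % 2)             // odd element paired with undef
    ++packed;
  if (packed < numElements)        // pad with undef dwords
    packed = numElements;
  return packed;
}

int main() {
  for (size_t n = 1; n <= 4; ++n)
    std::printf("v%zuf16 store reserves %zu dwords under the workaround\n",
                n, workaroundDwords(n));
  return 0;
}

For a v3f16 store this yields 3 dwords, where the previous resize to twice the pair count reserved only 2.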
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index b418fb6a2d9f..3749dcffa240 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX81 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
; PACKED-LABEL: name: image_store_f16
@@ -60,6 +62,44 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
+ ; GFX9-LABEL: name: image_store_f16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
+ ; GFX9: S_ENDPGM 0
+ ; GFX10-LABEL: name: image_store_f16
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
+ ; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -128,6 +168,42 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
+ ; GFX9-LABEL: name: image_store_v2f16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+ ; GFX9: S_ENDPGM 0
+ ; GFX10-LABEL: name: image_store_v2f16
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+ ; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -245,12 +321,78 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
- ; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
- ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
- ; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>)
- ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+ ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; GFX81: [[BITCAST4:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST4]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; GFX81: S_ENDPGM 0
+ ; GFX9-LABEL: name: image_store_v3f16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32)
+ ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+ ; GFX9: S_ENDPGM 0
+ ; GFX10-LABEL: name: image_store_v3f16
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
+ ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+ ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GFX10: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32)
+ ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+ ; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -329,6 +471,46 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
+ ; GFX9-LABEL: name: image_store_v4f16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+ ; GFX9: S_ENDPGM 0
+ ; GFX10-LABEL: name: image_store_v4f16
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+ ; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 18aa54dcf827..df5a26defd53 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -504,11 +504,95 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) {
ret <2 x half> %v
}
-; FIXME:
-; define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
-; %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; ret <3 x half> %v
-; }
+define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX8-UNPACKED-LABEL: load_1d_v3f16_xyz:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2
+; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9
+; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16
+; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-UNPACKED-NEXT: s_and_b32 s1, s0, s0
+; GFX8-UNPACKED-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, s0, v1
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: ; return to shader part epilog
+;
+; GFX8-PACKED-LABEL: load_1d_v3f16_xyz:
+; GFX8-PACKED: ; %bb.0:
+; GFX8-PACKED-NEXT: s_mov_b32 s0, s2
+; GFX8-PACKED-NEXT: s_mov_b32 s1, s3
+; GFX8-PACKED-NEXT: s_mov_b32 s2, s4
+; GFX8-PACKED-NEXT: s_mov_b32 s3, s5
+; GFX8-PACKED-NEXT: s_mov_b32 s4, s6
+; GFX8-PACKED-NEXT: s_mov_b32 s5, s7
+; GFX8-PACKED-NEXT: s_mov_b32 s6, s8
+; GFX8-PACKED-NEXT: s_mov_b32 s7, s9
+; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16
+; GFX8-PACKED-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-PACKED-NEXT: s_and_b32 s0, s0, s0
+; GFX8-PACKED-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-PACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-PACKED-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-PACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-PACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-PACKED-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_1d_v3f16_xyz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s0, s2
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
+; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
+; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: s_lshl_b32 s0, s0, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: load_1d_v3f16_xyz:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 16
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2
+; GFX10-NEXT: ; return to shader part epilog
+ %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret <3 x half> %v
+}
define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v4f16_xyzw:
@@ -712,13 +796,72 @@ define amdgpu_ps float @load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s
ret float %vv
}
-; FIXME:
-; define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) {
-; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; %v.err = extractvalue { <3 x half>, i32 } %v, 1
-; %vv = bitcast i32 %v.err to float
-; ret float %vv
-; }
+define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2
+; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9
+; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-UNPACKED-NEXT: ; return to shader part epilog
+;
+; GFX8-PACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz:
+; GFX8-PACKED: ; %bb.0:
+; GFX8-PACKED-NEXT: s_mov_b32 s0, s2
+; GFX8-PACKED-NEXT: s_mov_b32 s1, s3
+; GFX8-PACKED-NEXT: s_mov_b32 s2, s4
+; GFX8-PACKED-NEXT: s_mov_b32 s3, s5
+; GFX8-PACKED-NEXT: s_mov_b32 s4, s6
+; GFX8-PACKED-NEXT: s_mov_b32 s5, s7
+; GFX8-PACKED-NEXT: s_mov_b32 s6, s8
+; GFX8-PACKED-NEXT: s_mov_b32 s7, s9
+; GFX8-PACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16
+; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-PACKED-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_1d_v3f16_tfe_dmask_xyz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s0, s2
+; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
+; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
+; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: load_1d_v3f16_tfe_dmask_xyz:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: ; return to shader part epilog
+ %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.err = extractvalue { <3 x half>, i32 } %v, 1
+ %vv = bitcast i32 %v.err to float
+ ret float %vv
+}
define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
index 06006bd830f2..7e11446fd37d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s
+; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
; UNPACKED-LABEL: image_store_f16:
@@ -16,19 +18,6 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
; UNPACKED-NEXT: s_endpgm
;
-; PACKED-LABEL: image_store_f16:
-; PACKED: ; %bb.0:
-; PACKED-NEXT: s_mov_b32 s0, s2
-; PACKED-NEXT: s_mov_b32 s1, s3
-; PACKED-NEXT: s_mov_b32 s2, s4
-; PACKED-NEXT: s_mov_b32 s3, s5
-; PACKED-NEXT: s_mov_b32 s4, s6
-; PACKED-NEXT: s_mov_b32 s5, s7
-; PACKED-NEXT: s_mov_b32 s6, s8
-; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
-; PACKED-NEXT: s_endpgm
-;
; GFX81-LABEL: image_store_f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2
@@ -60,19 +49,6 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
; UNPACKED-NEXT: s_endpgm
;
-; PACKED-LABEL: image_store_v2f16:
-; PACKED: ; %bb.0:
-; PACKED-NEXT: s_mov_b32 s0, s2
-; PACKED-NEXT: s_mov_b32 s1, s3
-; PACKED-NEXT: s_mov_b32 s2, s4
-; PACKED-NEXT: s_mov_b32 s3, s5
-; PACKED-NEXT: s_mov_b32 s4, s6
-; PACKED-NEXT: s_mov_b32 s5, s7
-; PACKED-NEXT: s_mov_b32 s6, s8
-; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16
-; PACKED-NEXT: s_endpgm
-;
; GFX81-LABEL: image_store_v2f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2
@@ -89,11 +65,44 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
ret void
}
-; FIXME: Broken
-; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
-; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
-; ret void
-; }
+define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
+; UNPACKED-LABEL: image_store_v3f16:
+; UNPACKED: ; %bb.0:
+; UNPACKED-NEXT: v_mov_b32_e32 v5, v1
+; UNPACKED-NEXT: v_mov_b32_e32 v1, v2
+; UNPACKED-NEXT: s_mov_b32 s0, s2
+; UNPACKED-NEXT: s_mov_b32 s1, s3
+; UNPACKED-NEXT: s_mov_b32 s2, s4
+; UNPACKED-NEXT: s_mov_b32 s3, s5
+; UNPACKED-NEXT: s_mov_b32 s4, s6
+; UNPACKED-NEXT: s_mov_b32 s5, s7
+; UNPACKED-NEXT: s_mov_b32 s6, s8
+; UNPACKED-NEXT: s_mov_b32 s7, s9
+; UNPACKED-NEXT: v_mov_b32_e32 v4, v0
+; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; UNPACKED-NEXT: image_store v[1:3], v[4:5], s[0:7] dmask:0x7 unorm d16
+; UNPACKED-NEXT: s_endpgm
+;
+; GFX81-LABEL: image_store_v3f16:
+; GFX81: ; %bb.0:
+; GFX81-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX81-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX81-NEXT: s_mov_b32 s0, s2
+; GFX81-NEXT: s_mov_b32 s1, s3
+; GFX81-NEXT: s_mov_b32 s2, s4
+; GFX81-NEXT: s_mov_b32 s3, s5
+; GFX81-NEXT: s_mov_b32 s4, s6
+; GFX81-NEXT: s_mov_b32 s5, s7
+; GFX81-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX81-NEXT: s_mov_b32 s6, s8
+; GFX81-NEXT: s_mov_b32 s7, s9
+; GFX81-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX81-NEXT: v_mov_b32_e32 v4, 0
+; GFX81-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
+; GFX81-NEXT: s_endpgm
+ call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
; UNPACKED-LABEL: image_store_v4f16:
@@ -114,19 +123,6 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16
; UNPACKED-NEXT: s_endpgm
;
-; PACKED-LABEL: image_store_v4f16:
-; PACKED: ; %bb.0:
-; PACKED-NEXT: s_mov_b32 s0, s2
-; PACKED-NEXT: s_mov_b32 s1, s3
-; PACKED-NEXT: s_mov_b32 s2, s4
-; PACKED-NEXT: s_mov_b32 s3, s5
-; PACKED-NEXT: s_mov_b32 s4, s6
-; PACKED-NEXT: s_mov_b32 s5, s7
-; PACKED-NEXT: s_mov_b32 s6, s8
-; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16
-; PACKED-NEXT: s_endpgm
-;
; GFX81-LABEL: image_store_v4f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
index 06607e2a518e..ad9c55d7c1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
@@ -72,6 +72,12 @@ main_body:
ret float %x
}
+
+; GCN-LABEL: {{^}}image_load_3d_v3f16:
+; UNPACKED: image_load v[0:2], v[0:2], s[0:7] dmask:0x7 unorm d16
+; PACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16
+; GFX81: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16
+; GFX10: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_3d_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
main_body:
%tex = call <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32 7, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -103,6 +109,11 @@ main_body:
ret void
}
+; GCN-LABEL: {{^}}image_store_v3f16:
+; UNPACKED: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
+; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm d16
+; GFX81: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
+; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:
%r = bitcast <2 x float> %in to <4 x half>