[llvm] 7d4baf2 - [AMDGPU] Add maximum NSA size limit ISA feature
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 23 00:17:13 PDT 2021
Author: Carl Ritson
Date: 2021-07-23T16:16:06+09:00
New Revision: 7d4baf25aae5dd3610310d655ae5ec2881fb2d81
URL: https://github.com/llvm/llvm-project/commit/7d4baf25aae5dd3610310d655ae5ec2881fb2d81
DIFF: https://github.com/llvm/llvm-project/commit/7d4baf25aae5dd3610310d655ae5ec2881fb2d81.diff
LOG: [AMDGPU] Add maximum NSA size limit ISA feature
Add maximum NSA size limit as an ISA feature.
Use this to reduce NSA usage on GFX10.1 to avoid stability issues
with 4 and 5 dwords NSA instructions.
Maintain use of longer NSA instructions on GFX10.3.
Note: this also contains some minor fixes for GlobalISel which
did not work correctly with non-NSA form instructions on GFX10.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D103348
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/cluster_stores.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index c1903ddd39b16..7991f3d2a6b23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -629,6 +629,16 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
"Does not need SW waitstates"
>;
+class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
+ "nsa-max-size-"#Value,
+ "NSAMaxSize",
+ !cast<string>(Value),
+ "The maximum non-sequential address size in VGPRs."
+>;
+
+def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
+def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1031,6 +1041,7 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
@@ -1052,6 +1063,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureDot6Insts,
FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
@@ -1073,6 +1085,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureDot6Insts,
FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
@@ -1090,6 +1103,7 @@ def FeatureISAVersion10_1_3 : FeatureSet<
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
@@ -1113,6 +1127,7 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureDot6Insts,
FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize13,
FeatureWavefrontSize32,
FeatureShaderCyclesRegister]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c52439e8b258d..85a36b1c348b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4153,12 +4153,11 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
Register AddrReg = SrcOp.getReg();
- if (I < Intr->GradientStart) {
- AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
- PackedAddrs.push_back(AddrReg);
- } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
- (I >= Intr->CoordStart && !IsA16)) {
+ if ((I < Intr->GradientStart) ||
+ (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
+ (I >= Intr->CoordStart && !IsA16)) {
// Handle any gradient or coordinate operands that should not be packed
+ AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
} else {
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
@@ -4202,9 +4201,8 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int NumAddrRegs = AddrRegs.size();
if (NumAddrRegs != 1) {
- // Round up to 8 elements for v5-v7
- // FIXME: Missing intermediate sized register classes and instructions.
- if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+ // Above 8 elements round up to next power of 2 (i.e. 16).
+ if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
auto Undef = B.buildUndef(S32);
AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
@@ -4375,7 +4373,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
IsG16);
// See also below in the non-a16 branch
- const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
+ PackedRegs.size() <= ST.getNSAMaxSize();
if (!UseNSA && PackedRegs.size() > 1) {
LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
@@ -4412,7 +4411,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+ CorrectedNumVAddrs <= ST.getNSAMaxSize();
if (!UseNSA && Intr->NumVAddrs > 1)
convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 7e5f0d0d5257d..0c5020dccecd6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -264,6 +264,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasGFX10A16(false),
HasG16(false),
HasNSAEncoding(false),
+ NSAMaxSize(0),
GFX10_AEncoding(false),
GFX10_BEncoding(false),
HasDLInsts(false),
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index dc53568c1b9d4..bd0c40081c01f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -136,6 +136,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasGFX10A16;
bool HasG16;
bool HasNSAEncoding;
+ unsigned NSAMaxSize;
bool GFX10_AEncoding;
bool GFX10_BEncoding;
bool HasDLInsts;
@@ -878,6 +879,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasNSAEncoding() const { return HasNSAEncoding; }
+ unsigned getNSAMaxSize() const { return NSAMaxSize; }
+
bool hasGFX10_AEncoding() const {
return GFX10_AEncoding;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f33e7e1f5b8c4..17f855310ed0e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6211,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- bool UseNSA =
- ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
+ VAddrs.size() >= 3 &&
+ VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
SDValue VAddr;
if (!UseNSA)
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
index cc5259a9f2ae1..7f25e8e2d9dbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -1669,7 +1669,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[COPY28]](s32)
; GFX10: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY29]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -3511,7 +3512,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
; GFX10: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -3606,7 +3608,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
index f02ab648442c3..f74d1d4686a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -27,7 +27,8 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -69,7 +70,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -119,7 +122,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32)
; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -159,7 +166,8 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -203,7 +211,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -242,7 +252,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -285,7 +297,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -326,7 +341,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -371,7 +388,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -409,7 +430,8 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -451,7 +473,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -491,7 +515,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -535,7 +560,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -574,7 +601,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -617,7 +646,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -658,7 +690,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -703,7 +737,11 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
@@ -750,7 +788,11 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+ ; GFX10: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
; GFX10: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -793,7 +835,11 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32)
- ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+ ; GFX10: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
+ ; GFX10: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
+ ; GFX10: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GFX10: $vgpr0 = COPY [[UV]](s32)
; GFX10: $vgpr1 = COPY [[UV1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 000b0047c13ae..88ab397f4b7e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -505,7 +505,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX6-NEXT: s_mov_b32 s11, s13
; GFX6-NEXT: s_wqm_b64 exec, exec
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
index fa92c03780a60..9792459169859 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
@@ -161,7 +161,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX6-NEXT: s_mov_b32 s11, s13
; GFX6-NEXT: s_wqm_b64 exec, exec
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_c_cl_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@@ -255,7 +255,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX6-NEXT: s_mov_b32 s11, s13
; GFX6-NEXT: s_wqm_b64 exec, exec
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_c_b_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@@ -299,7 +299,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX6-NEXT: s_mov_b32 s9, s11
; GFX6-NEXT: s_mov_b32 s10, s12
; GFX6-NEXT: s_mov_b32 s11, s13
-; GFX6-NEXT: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_b_cl_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@@ -343,7 +343,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i3
; GFX6-NEXT: s_mov_b32 s11, s13
; GFX6-NEXT: s_wqm_b64 exec, exec
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX6-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:5], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@@ -428,7 +428,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX6-NEXT: s_mov_b32 s9, s11
; GFX6-NEXT: s_mov_b32 s10, s12
; GFX6-NEXT: s_mov_b32 s11, s13
-; GFX6-NEXT: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1
+; GFX6-NEXT: image_gather4_c_l_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index ed58051c51ba8..9f79ac2bf80f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -35,15 +35,17 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10-NEXT: v_and_or_b32 v0, v0, v9, v1
-; GFX10-NEXT: v_and_or_b32 v1, v2, v9, s12
-; GFX10-NEXT: v_and_or_b32 v2, v3, v9, v4
-; GFX10-NEXT: v_and_or_b32 v3, v5, v9, s12
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12
+; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4
+; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1
+; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12
+; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -131,12 +133,15 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
-; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4
+; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -255,12 +260,15 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
-; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4
+; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -271,12 +279,17 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
-; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -287,12 +300,17 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
-; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5
+; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 1d4b4740ebae8..45b6c44f9ae49 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -351,21 +351,28 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
;
; GFX10-LABEL: cluster_image_sample:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v13, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cvt_f32_i32_e32 v8, v0
+; GFX10-NEXT: v_cvt_f32_i32_e32 v9, v1
; GFX10-NEXT: v_mov_b32_e32 v10, 1.0
-; GFX10-NEXT: v_add_f32_e32 v11, 1.0, v2
-; GFX10-NEXT: v_add_f32_e32 v12, 1.0, v3
-; GFX10-NEXT: v_add_f32_e32 v14, 2.0, v2
-; GFX10-NEXT: v_add_f32_e32 v15, 2.0, v3
-; GFX10-NEXT: image_sample_d v[2:5], [v11, v12, v13, v13, v13, v13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: image_sample_d v[6:9], [v14, v15, v10, v10, v10, v10], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_add_f32_e32 v2, 1.0, v8
+; GFX10-NEXT: v_add_f32_e32 v3, 1.0, v9
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_mov_b32_e32 v7, v4
+; GFX10-NEXT: v_add_f32_e32 v8, 2.0, v8
+; GFX10-NEXT: v_add_f32_e32 v9, 2.0, v9
+; GFX10-NEXT: v_mov_b32_e32 v11, v10
+; GFX10-NEXT: v_mov_b32_e32 v12, v10
+; GFX10-NEXT: v_mov_b32_e32 v13, v10
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: image_sample_d v[14:17], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_sample_d v[18:21], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_add_f32_e32 v5, v17, v21
+; GFX10-NEXT: v_add_f32_e32 v4, v16, v20
+; GFX10-NEXT: v_add_f32_e32 v3, v15, v19
+; GFX10-NEXT: v_add_f32_e32 v2, v14, v18
; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 86c6bae06204f..6433dd78a3d22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NONSA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010,NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030,NSA %s
; GCN-LABEL: {{^}}sample_2d:
;
@@ -24,7 +25,8 @@ main_body:
}
; GCN-LABEL: {{^}}sample_d_3d:
-; NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
+; GFX1010: image_sample_d v[0:3], v[7:22],
+; GFX1030: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -33,8 +35,8 @@ main_body:
; GCN-LABEL: {{^}}sample_contig_nsa:
; NONSA: image_sample_c_l v5, v[0:4],
-; NSA: image_sample_c_l v8, v[0:4],
-; NSA: image_sample v9, [v6, v7, v5],
+; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -45,8 +47,8 @@ main_body:
}
; GCN-LABEL: {{^}}sample_nsa_nsa:
-; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
-; NSA: image_sample v9, [v6, v7, v5],
+; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
+; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -57,8 +59,8 @@ main_body:
}
; GCN-LABEL: {{^}}sample_nsa_contig:
-; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
-; NSA: image_sample v9, v[5:7],
+; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
+; NSA: image_sample v{{[0-9]+}}, v[5:7],
define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -69,10 +71,10 @@ main_body:
}
; GCN-LABEL: {{^}}sample_contig_contig:
-; NSA: image_sample_c_l v8, v[0:4],
-; NSA: image_sample v9, v[5:7],
-; NONSA: image_sample_c_l v8, v[0:4],
-; NONSA: image_sample v9, v[5:7],
+; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NSA: image_sample v{{[0-9]+}}, v[5:7],
+; NONSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
+; NONSA: image_sample v{{[0-9]+}}, v[5:7],
define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 0a50684cddb3e..6ee84aa4704fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -600,14 +600,17 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT: v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT: v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10-NEXT: v_mov_b32_e32 v12, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v10, v5
+; GFX10-NEXT: v_and_b32_e32 v5, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v3, v2, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v5
+; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -1156,14 +1159,17 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_mov_b32_e32 v13, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v6
+; GFX10-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -1190,14 +1196,17 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
;
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_mov_b32_e32 v13, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v6
+; GFX10-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index ffc1308a4b2ec..0bc7f5093b0e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1739,14 +1739,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
-; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
+; GFX10-NEXT: v_mov_b32_e32 v9, v11 ; encoding: [0x0b,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v12 ; encoding: [0x0c,0x03,0x14,0x7e]
+; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x04,0xe9,0xf0,0x00,0x09,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v11, v1, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x0c,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT: global_store_dword v11, v10, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x0a,0x0c,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -1807,14 +1806,14 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
;
; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v11, v0 ; encoding: [0x00,0x03,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, v1 ; encoding: [0x01,0x03,0x14,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: image_sample_c_d_o v[0:2], [v11, v10, v9, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x06,0xe9,0xf0,0x0b,0x00,0x40,0x00,0x0a,0x09,0x03,0x04,0x05,0x06,0x07,0x08]
+; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e]
+; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x06,0xe9,0xf0,0x00,0x09,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
+; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e]
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
index 12b0cb7311a89..b507d9fc0049c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll
@@ -47,9 +47,16 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, half %s, half %t, half %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: image_sample_d v[0:3], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10-NEXT: v_mov_b32_e32 v15, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT: v_mov_b32_e32 v13, v5
+; GFX10-NEXT: v_mov_b32_e32 v12, v4
+; GFX10-NEXT: v_mov_b32_e32 v11, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT: image_sample_d v[0:3], v[8:15], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -132,9 +139,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_mov_b32_e32 v11, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0
+; GFX10-NEXT: image_sample_d_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -177,9 +189,15 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
; GFX10-LABEL: sample_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_mov_b32_e32 v13, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX10-NEXT: v_mov_b32_e32 v11, v4
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0
+; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -302,9 +320,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_mov_b32_e32 v11, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0
+; GFX10-NEXT: image_sample_cd_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -347,9 +370,15 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_mov_b32_e32 v13, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX10-NEXT: v_mov_b32_e32 v11, v4
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0
+; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -371,9 +400,16 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_mov_b32_e32 v15, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT: v_mov_b32_e32 v13, v5
+; GFX10-NEXT: v_mov_b32_e32 v12, v4
+; GFX10-NEXT: v_mov_b32_e32 v11, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -395,9 +431,16 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
-; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_mov_b32_e32 v15, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; GFX10-NEXT: v_mov_b32_e32 v13, v5
+; GFX10-NEXT: v_mov_b32_e32 v12, v4
+; GFX10-NEXT: v_mov_b32_e32 v11, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0
+; GFX10-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
@@ -489,26 +532,30 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_g16_noa16_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT: v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v9, v2, v9
+; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
+; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10GISEL-LABEL: sample_g16_noa16_d_3d:
; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v9, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v9, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v9, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v3, v5, v9, s12
-; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1
+; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12
+; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -636,23 +683,28 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_g16_noa16_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v8, v4
-; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -828,23 +880,28 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_g16_noa16_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v8, v2
-; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v8, v4
-; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3
+; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -855,23 +912,32 @@ main_body:
define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v3, v4, v9, v5
-; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -882,23 +948,32 @@ main_body:
define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v9, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v3, v4, v9, v5
-; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5
+; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1
+; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index bbbda67a06dd0..75990337c8a38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -31,12 +31,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 ; encoding: [0x09,0x07,0x06,0x36]
-; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 ; encoding: [0x09,0x01,0x00,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -112,12 +114,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36]
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
-; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -220,12 +224,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36]
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
-; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -236,12 +242,16 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36]
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -252,12 +262,16 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00]
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36]
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index 24701d211d933..e0a18eb9ae66f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -31,12 +31,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v9, v3
-; GFX10-NEXT: v_and_b32_e32 v0, v9, v0
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v9, v2, v9
+; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9
+; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
+; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -112,12 +114,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -220,12 +224,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
-; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -236,12 +242,16 @@ main_body:
define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -252,12 +262,16 @@ main_body:
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
-; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
More information about the llvm-commits
mailing list